Review:   approximating π using the Rectangle Rule

  • The single-threaded Rectangle Rule algorithm in C used to approximate π:

    // Integrand for the approximation: the integral of 2/sqrt(1 - x^2)
    // over [0, 1] equals 2*arcsin(1) = pi.
    double f(double x)
    {
        return( 2.0 / sqrt(1 - x*x) ); // Used to approximate pi
    }
    
    // Single-threaded midpoint Rectangle Rule: sum N rectangles of
    // width w = 1/N, each sampled at its midpoint.
    int main(int argc, char *argv[])
    {
        int    N;
        double w, sum;
    
        N = ...;     // N will determine the accuracy of the approximation
        w = 1.0/N;   // a = 0, b = 1, so: (b-a) = 1.0
    
        sum = 0.0;
    
        double x;
    
        for (int i = 0; i < N; i++)
        {
            // Midpoint sample: x = (i + 0.5)*w < 1 for all i, so we never
            // evaluate f at the singularity x = 1.
            x = (i + 0.5) * w;
            sum = sum + w*f(x);
        }
    
        printf("Pi = %lf", sum);
    }
    

DEMO: demo/OpenMP/compute-pi.c

Parallelizing the Rectangle Rule Algorithm

  • We write an OpenMP parallel version of the Rectangle Rule with work load distribution:

     Work load for 2 threads: 
    
                    values handled by thread 0
       |   |   |   |   |   |   |   |   |   |   |   |   |   |
       V   V   V   V   V   V   V   V   V   V   V   V   V   V
       |-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|      
         ^   ^   ^   ^   ^   ^   ^   ^   ^   ^   ^   ^   ^   ^
         |   |   |   |   |   |   |   |   |   |   |   |   |   |
                    values handled by thread 1
    
    
    In general, the work load for thread "id" consists of the iterations  id, id+nThreads, id+2*nThreads, ...  — i.e., thread "id" handles every nThreads-th rectangle, starting at rectangle id.

Parallelizing the Rectangle Rule Algorithm

This is the single-threaded Rectangle-Rule algorithm:

// The single-threaded Rectangle-Rule algorithm (starting point before
// parallelization).  NOTE(review): the original slide showed the OpenMP
// parallel version here despite the "single-threaded" caption; this is
// the actual sequential code that the following slides transform.
int main(int argc, char *argv[])
{
   int    N = ....;  // Assume N is initialized
   double sum;
   double w;

   w = 1.0/(double) N;   // Rectangle width: (b-a)/N with a=0, b=1
   sum = 0.0;            // Accumulated area = pi approximation

   double x;
   for (int i = 0; i < N; i = i + 1)
   {
      x = w*(i + 0.5);        // Midpoint of rectangle i

      sum = sum + w*f(x);     // Add rectangle i's area
   }

   printf("\nPi = %lf\n\n", sum);
}

Parallelizing the Rectangle Rule Algorithm

We create a parallel region to execute the loop:

// Step 1: wrap the loop in an OpenMP parallel region.
// NOTE(review): at this stage EVERY thread still executes the full
// i = 0..N-1 loop, so sum accumulates roughly nThreads * pi, and the
// unsynchronized updates to the shared "sum" form a data race.
// Both problems are fixed on the following slides.
int main(int argc, char *argv[])
{
   int    N = ....;  // Assume N is initialized
   double sum;
   double w;

   w = 1.0/(double) N;
   sum = 0.0;



#pragma omp parallel
{
   int id = omp_get_thread_num();
   int nThreads = omp_get_num_threads() ;


   double x;   // Declared inside the region: private to each thread
   for (int i =  0; i < N; i = i + 1       )
   {
      x = w*(i + 0.5);

      sum = sum +  w*f(x);   // RACE: shared "sum", not yet synchronized

   }
}
   printf("\nPi = %lf\n\n", sum);
}

Parallelizing the Rectangle Rule Algorithm

Find the thread's ID and the total # of threads (both are needed to distribute the work load):

// Step 2: obtain each thread's ID and the total thread count.
// These are needed to divide the N iterations among the threads;
// the loop itself is unchanged so far (every thread still does all
// the work, and the race on "sum" remains).
int main(int argc, char *argv[])
{
   int    N = ....;  // Assume N is initialized
   double sum;
   double w;

   w = 1.0/(double) N;
   sum = 0.0;



#pragma omp parallel
{
   int id = omp_get_thread_num();             // Thread's ID
   int nThreads = omp_get_num_threads();      // # threads


   double x;
   for (int i =  0; i < N; i = i + 1       )
   {
      x = w*(i + 0.5);

      sum = sum +  w*f(x);

   }
}
   printf("\nPi = %lf\n\n", sum);
}

Parallelizing the Rectangle Rule Algorithm

Assign/distribute the work load to each thread:

// Step 3: distribute the work cyclically (round-robin): thread "id"
// handles iterations id, id+nThreads, id+2*nThreads, ...
// NOTE(review): the updates to the shared "sum" are still NOT
// synchronized — this is the race demonstrated by openMP-compute-pi1.c.
int main(int argc, char *argv[])
{
   int    N = ....;  // Assume N is initialized
   double sum;
   double w;

   w = 1.0/(double) N;
   sum = 0.0;



#pragma omp parallel
{
   int id = omp_get_thread_num();             // Thread's ID
   int nThreads = omp_get_num_threads();


   double x;
   for (int i = id; i < N; i = i + nThreads)  // Cyclic work distribution
   {
      x = w*(i + 0.5);

      sum = sum +  w*f(x);   // RACE: unsynchronized shared update

   }
}
   printf("\nPi = %lf\n\n", sum);
}

DEMO: demo/OpenMP/openMP-compute-pi1.c --- has synchronization errors

Synchronizing updates with a mutex lock in OpenMP

Updates to the shared variable sum must be synchronized:

// The problem restated: "sum" is shared by all threads, so the
// read-modify-write "sum = sum + ..." must be made atomic/mutually
// exclusive; the next slides introduce an OpenMP lock for this.
int main(int argc, char *argv[])
{
   int    N = ....;  // Assume N is initialized
   double sum;
   double w;

   w = 1.0/(double) N;
   sum = 0.0;  // Shared variable among the threads



#pragma omp parallel
{
   int id = omp_get_thread_num();             // Thread's ID
   int nThreads = omp_get_num_threads();


   double x;
   for (int i = id; i < N; i = i + nThreads)
   {
      x = w*(i + 0.5);

      sum = sum +  w*f(x);  // Update to shared variable

   }
}
   printf("\nPi = %lf\n\n", sum);
}

Synchronizing updates with a mutex lock in OpenMP

Create and initialize a OpenMP lock variable sumLock:

// Step 4: declare and initialize an OpenMP mutex lock.
// The lock is not yet used here — the shared update below is still
// unsynchronized; the next slide adds the set/unset calls.
// NOTE(review): omp_init_lock has a matching omp_destroy_lock which is
// never called in this series of slides — worth mentioning in class.
int main(int argc, char *argv[])
{
   int    N = ....;  // Assume N is initialized
   double sum;
   double w;

   w = 1.0/(double) N;
   sum = 0.0;  // Shared variable among the threads

   omp_lock_t sumLock;          // Create lock variable 
   omp_init_lock(&sumLock);     // Initialize lock variable 
#pragma omp parallel
{
   int id = omp_get_thread_num();             // Thread's ID
   int nThreads = omp_get_num_threads();


   double x;
   for (int i = id; i < N; i = i + nThreads)
   {
      x = w*(i + 0.5);

      sum = sum +  w*f(x);  // Update to shared variable
 
   }
}
   printf("\nPi = %lf\n\n", sum);
}

Synchronizing updates with a mutex lock in OpenMP

Use the sumLock variable to synchronize the updates:

// Step 5: protect the shared update with the lock.  This is now
// CORRECT, but slow: the lock is taken and released on EVERY iteration,
// which serializes the threads on the hot path — the bottleneck
// demonstrated by openMP-compute-pi2.c and fixed on the next slide.
int main(int argc, char *argv[])
{
   int    N = ....;  // Assume N is initialized
   double sum;
   double w;

   w = 1.0/(double) N;
   sum = 0.0;  // Shared variable among the threads

   omp_lock_t sumLock;          // Create lock variable 
   omp_init_lock(&sumLock);     // Initialize lock variable 
#pragma omp parallel
{
   int id = omp_get_thread_num();             // Thread's ID
   int nThreads = omp_get_num_threads();


   double x;
   for (int i = id; i < N; i = i + nThreads)
   {
      x = w*(i + 0.5);
      omp_set_lock(&sumLock);     // Lock the mutex 
      sum = sum +  w*f(x);  // Update to shared variable
      omp_unset_lock(&sumLock);   // Unlock the mutex 
   }
}
   printf("\nPi = %lf\n\n", sum);
}

DEMO: demo/OpenMP/openMP-compute-pi2.c --- has a synchronization bottleneck

Use a private (local) variable to avoid synchronization bottleneck

Use a private variable mySum to avoid the synchronization bottleneck:

// Step 6: each thread accumulates into a PRIVATE partial sum and takes
// the lock only ONCE, after its loop, to merge the partial sum into the
// shared total — eliminating the per-iteration locking bottleneck.
int main(int argc, char *argv[])
{
   int    N = ....;  // Assume N is initialized
   double sum;
   double w;

   w = 1.0/(double) N;
   sum = 0.0;  // Shared variable among the threads

   omp_lock_t sumLock;          // Create lock variable 
   omp_init_lock(&sumLock);     // Initialize lock variable 
#pragma omp parallel
{
   int id = omp_get_thread_num();             // Thread's ID
   int nThreads = omp_get_num_threads();

   double mySum = 0;   // Private (per-thread) partial sum — no race
   double x;
   for (int i = id; i < N; i = i + nThreads)
   {
      x = w*(i + 0.5);
      mySum = mySum + w*f(x);  // Update to private variable
   }
   omp_set_lock(&sumLock);     // Lock the mutex 
   sum = sum + mySum;  // Update to shared variable (once per thread)
   omp_unset_lock(&sumLock);   // Unlock the mutex 
}
   omp_destroy_lock(&sumLock);  // Release the lock (pairs with omp_init_lock)
   printf("\nPi = %lf\n\n", sum);
}

DEMO: demo/OpenMP/openMP-compute-pi3.c

Alternate way of synchronizing updates (without a mutex lock) in OpenMP

Alternate way to synchronize updates in OpenMP:   the critical region

// Alternative to an explicit lock: an OpenMP critical region around the
// one shared update per thread.  The "critical region" lines below are
// pseudo-code placeholders; the real syntax appears on the next slide.
int main(int argc, char *argv[])
{
   int    N = ....;  // Assume N is initialized
   double sum;
   double w;

   w = 1.0/(double) N;
   sum = 0.0;  // Shared variable among the threads



#pragma omp parallel
{
   int id = omp_get_thread_num();             // Thread's ID
   int nThreads = omp_get_num_threads();

   double mySum = 0;
   double x;
   for (int i = id; i < N; i = i + nThreads)
   {
      x = w*(i + 0.5);
      mySum = mySum + w*f(x);  // Update to private variable
   }
   critical region                    /* placeholder: start of critical region */
   sum = sum + mySum;  // Update to shared variable
   critical region                    /* placeholder: end of critical region */
}
   printf("\nPi = %lf\n\n", sum);
}

 

Alternate way of synchronizing updates (without a mutex lock) in OpenMP

Statements inside a critical region are executed by at most one (1) thread at a time:

// Final version: the shared update is protected by an OpenMP critical
// construct; threads enter it one at a time.
// BUG FIX(review): the original read "#pragma critical {" — the "omp"
// keyword was missing, so compilers silently ignore the unknown pragma
// and the update is NOT synchronized at all; also the "{" must be on
// its own line, since a #pragma directive ends at the newline.
int main(int argc, char *argv[])
{
   int    N = ....;  // Assume N is initialized
   double sum;
   double w;

   w = 1.0/(double) N;
   sum = 0.0;  // Shared variable among the threads



#pragma omp parallel
{
   int id = omp_get_thread_num();             // Thread's ID
   int nThreads = omp_get_num_threads();

   double mySum = 0;   // Private (per-thread) partial sum
   double x;
   for (int i = id; i < N; i = i + nThreads)
   {
      x = w*(i + 0.5);
      mySum = mySum + w*f(x);  // Update to private variable
   }
   #pragma omp critical
   {
      sum = sum + mySum;  // Update to shared variable (one thread at a time)
   }
}
   printf("\nPi = %lf\n\n", sum);
}

DEMO: demo/OpenMP/openMP-compute-pi4.c