```cpp
#include <omp.h>

#pragma omp parallel [Options...]
{
   ... Parallel region ...
   ... Program statements between the braces
   ... are executed in parallel by all threads ...
}
```
Set the number of threads with the `OMP_NUM_THREADS` environment variable:

```
export OMP_NUM_THREADS=...
```

Example: `export OMP_NUM_THREADS=8`
```cpp
#include <iostream>
#include <omp.h>
using namespace std;

int main(int argc, char *argv[])
{
   #pragma omp parallel
   {
      cout << "Hello World !!!" << endl;
   }
}
```
Execution:

```
export OMP_NUM_THREADS=8
a.out
```
You will see "Hello World !!!" printed EIGHT times !!! (Remove the #pragma line and you get ONE line)....
```cpp
#include <iostream>
#include <omp.h>
using namespace std;

int main(int argc, char *argv[])
{
   int N;   // Variable defined OUTSIDE the parallel region...
            // It is therefore SHARED

   N = 1001;
   cout << "Before parallel section: N = " << N << endl;

   #pragma omp parallel
   {
      N = N + 1;
      cout << "Inside parallel section: N = " << N << endl;
   }

   cout << "After parallel section: N = " << N << endl;
}
```
You should see that the final value of N is not always 1009 (with 8 threads); it can be less. You have seen this phenomenon before in threaded programs: when multiple threads update a shared variable concurrently, some updates can be lost. The same thing is happening here.
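As a preview of the fix discussed later in these notes (the critical construct), a minimal sketch that guards the update so only one thread at a time executes it, making the result deterministic:

```cpp
#include <iostream>
#include <omp.h>
using namespace std;

int main(int argc, char *argv[])
{
   int N = 1001;   // shared variable

   #pragma omp parallel
   {
      #pragma omp critical      // only one thread at a time may update N
      {
         N = N + 1;
      }
   }

   // With 8 threads the result is now always 1001 + 8 = 1009
   cout << "After parallel section: N = " << N << endl;
}
```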
```cpp
#include <iostream>
#include <omp.h>
using namespace std;

int main(int argc, char *argv[])
{
   #pragma omp parallel
   {
      int N;   // Variable defined INSIDE the parallel region...
               // It is therefore NON-SHARED (each thread has its own N)

      N = 1001;
      N = N + 1;
      cout << "Inside parallel section: N = " << N << endl;
   }

   // ERROR if you try to do this:
   //    cout << "N = " << N << endl;
   // because N is not defined in the outer scope !!!
}
```
You should see that the value of N printed by every thread is always 1002: each thread has its own copy of N, so there is no interference.
```cpp
#include <iostream>
#include <omp.h>
using namespace std;

int main(int argc, char *argv[])
{
   int N;                           // Line XXX
   N = 1001;
   cout << "Before parallel section: N = " << N << endl;

   #pragma omp parallel private(N)
   {  // private(N) gives each thread its own local variable N !
      N = N + 1;                    // This N is different from the N at line XXX !!!
      cout << "Inside parallel section: N = " << N << endl;
   }

   cout << "After parallel section: N = " << N << endl;
}
```
You should see that the value of N printed inside the parallel section is typically 1: the private copy of N is uninitialized (its initial value is formally undefined, but in practice it often starts at 0), so N + 1 yields 1. The variable N outside the parallel section remains 1001.
Function Name | Effect |
---|---|
`void omp_set_num_threads(int nthreads)` | Set the size of the thread team |
`int omp_get_num_threads()` | Return the size of the current thread team |
`int omp_get_max_threads()` | Return the maximum size of the thread team (typically equal to the number of processors) |
`int omp_get_thread_num()` | Return the thread ID of the thread that calls this function |
`int omp_get_num_procs()` | Return the number of processors |
`int omp_in_parallel()` | Return true (non-zero) if currently inside a parallel region |
`void omp_init_lock(omp_lock_t *lock)` | Initialize the mutex lock `lock` |
`void omp_set_lock(omp_lock_t *lock)` | Lock (acquire) the mutex lock `lock`; blocks until the lock is available |
`void omp_unset_lock(omp_lock_t *lock)` | Unlock (release) the mutex lock `lock` |
`int omp_test_lock(omp_lock_t *lock)` | Try to acquire the mutex lock `lock` without blocking; returns true if the lock was acquired, false otherwise |
NOTE: We will study other synchronization primitives later and will not discuss the omp_..._lock() functions further.
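For completeness, a minimal sketch of how the lock functions in the table above can be used to protect a shared counter:

```cpp
#include <iostream>
#include <omp.h>
using namespace std;

int N = 0;              // shared counter
omp_lock_t N_lock;      // protects N

int main(int argc, char *argv[])
{
   omp_init_lock(&N_lock);          // initialize the lock before use

   #pragma omp parallel
   {
      omp_set_lock(&N_lock);        // acquire: only one thread at a time passes
      N = N + 1;                    // safe update of the shared variable
      omp_unset_lock(&N_lock);      // release
   }

   cout << "N = " << N << endl;     // equals the number of threads
   omp_destroy_lock(&N_lock);       // clean up
}
```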
```cpp
#include <iostream>
#include <omp.h>      // Read in OpenMP function prototypes
using namespace std;

int main(int argc, char *argv[])
{
   int nthreads, myid;

   #pragma omp parallel private(nthreads, myid)
   {
      /* Every thread does this */
      myid = omp_get_thread_num();
      cout << "Hello I am thread " << myid << endl;

      /* Only thread 0 does this */
      if (myid == 0)
      {
         nthreads = omp_get_num_threads();
         cout << "Number of threads = " << nthreads << endl;
      }
   }
   return 0;
}
```
```cpp
/* Shared Variables */
double x[1000000];    // Must be SHARED (accessed by worker threads !!)
int    start[100];    // Contains the starting array index of each thread
double min[100];      // Contains the minimum found by each thread
int    num_threads;

int main(...)
{
   for (i = 0; i < MAX; i++)
      x[i] = random()/(double)2147483648;   // random value in [0,1)

   // ---------------------------- Start parallel -----
   #pragma omp parallel
   {
      ... Thread i finds its minimum and
      ... stores the result in min[i]
   }
   // ---------------------------- End parallel -----

   // ----------------------------------------
   // Post processing: Find the actual minimum
   // ----------------------------------------
   my_min = min[0];
   for (i = 1; i < num_threads; i++)
      if ( min[i] < my_min )
         my_min = min[i];
}
```
(For simplicity of discussion, I used 2 threads)
```
start[0]                 start[1]
   |                        |
   |  values handled by     |  values handled by
   V  thread 0              V  thread 1
   |<--------------------->|<--------------------->|
```
```cpp
#include <omp.h>
#include <cstdlib>

#define MAX 1000000

/* Shared Variables */
double x[MAX];        // Must be SHARED (accessed by worker threads !!)
int    start[100];    // Contains the starting array index of each thread
double min[100];      // Contains the minimum found by each thread
int    num_threads;

int main(...)
{
   for (i = 0; i < MAX; i++)
      x[i] = random()/(double)2147483648;

   // ---------------------------- Start parallel -----
   #pragma omp parallel
   {
      int    id;
      int    i, n, start, stop;    // (this local start shadows the global array)
      double my_min;

      num_threads = omp_get_num_threads();
      n  = MAX/num_threads;         // step = MAX/number of threads
      id = omp_get_thread_num();    // id is one of 0, 1, ..., (num_threads-1)

      /* ----------------------------
         Find the starting index
         ---------------------------- */
      start = id * n;

      /* ----------------------------
         Find the stopping index
         ---------------------------- */
      if ( id != (num_threads-1) )
      {
         stop = start + n;
      }
      else
      {
         stop = MAX;    // the last thread also handles the leftover elements
      }

      /* ------------------------------------------
         Find the min between x[start] and x[stop]
         ------------------------------------------ */
      my_min = x[start];
      for (i = start+1; i < stop; i++)
      {
         if ( x[i] < my_min )
            my_min = x[i];
      }

      /* ----------------------------
         Save result in shared area
         ---------------------------- */
      min[id] = my_min;             // Store result in min[id]
   }
   // ---------------------------- End parallel -----

   // ----------------------------------------
   // Post processing: Find the actual minimum
   // ----------------------------------------
   my_min = min[0];
   for (i = 1; i < num_threads; i++)
      if ( min[i] < my_min )
         my_min = min[i];
}
```
Compile with:
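A sketch, assuming the GNU compiler; the source file name is just a placeholder:

```
g++ -fopenmp find_min.cpp
```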
Run with (on compute):

```
export OMP_NUM_THREADS=8
a.out
```
I will limit the discussion to the most commonly used one: mutual exclusion.
```cpp
#pragma omp critical
{
   ... Mutually exclusive access to
   ... shared variables ...
}
```
```cpp
int N;    // Global - shared by all threads

int main(...)
{
   ....

   /* -------------------
      Parallel section
      ------------------- */
   #pragma omp parallel
   {
      ....

      /* ---------------------------------------
         Section with mutually exclusive access
         --------------------------------------- */
      #pragma omp critical
      {
         N = N + 1;
      }
      ....
   }
   ...
}
```
```cpp
#include <iostream>
#include <cmath>
using namespace std;

double f(double a)
{
   return( 2.0 / sqrt(1 - a*a) );
}

int main(int argc, char *argv[])
{
   int i;
   int N;
   double sum;
   double x, w;

   N = ...;        // accuracy of the approximation
   w = 1.0/N;

   sum = 0.0;
   for (i = 1; i <= N; i = i + 1)
   {
      x = w*(i - 0.5);
      sum = sum + w*f(x);
   }
   cout << sum;
}
```
Compile with:
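A sketch, assuming the GNU compiler; the source file name is just a placeholder:

```
g++ pi_seq.cpp
```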
Run the program with:
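Assuming the default executable name (remember that the program above still needs a concrete value for N):

```
a.out
```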
```
 values handled by thread 0
 |   |   |   |   |   |   |   |   |   |   |   |   |   |
 V   V   V   V   V   V   V   V   V   V   V   V   V   V
|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|
   ^   ^   ^   ^   ^   ^   ^   ^   ^   ^   ^   ^   ^
   |   |   |   |   |   |   |   |   |   |   |   |   |
   values handled by thread 1
```
```cpp
#include <iostream>
#include <cmath>
#include <omp.h>
using namespace std;

double f(double a)
{
   return( 2.0 / sqrt(1 - a*a) );
}

int main(int argc, char *argv[])
{
   int N;
   double w;
   double sum;   // Shared variable, updated by all threads !

   N = ...;      // accuracy of the approximation
   w = 1.0/N;

   sum = 0.0;
   #pragma omp parallel
   {
      int i, num_threads;   // Non-shared variables !!!
      double x;

      num_threads = omp_get_num_threads();

      for (i = omp_get_thread_num(); i < N; i = i + num_threads)
      {
         x = w*(i + 0.5);
         #pragma omp critical
         {
            sum = sum + w*f(x);   // every update is synchronized --> slow
         }
      }
   }
   cout << sum;
}
```
Change OMP_NUM_THREADS and see the difference in performance
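One simple way to see the difference is to time the parallel region with omp_get_wtime(); a minimal sketch:

```cpp
#include <iostream>
#include <omp.h>
using namespace std;

int main(int argc, char *argv[])
{
   double t0 = omp_get_wtime();   // wall-clock time before the parallel work

   #pragma omp parallel
   {
      // ... the parallel work being measured ...
   }

   double t1 = omp_get_wtime();   // wall-clock time after
   cout << "Elapsed time = " << (t1 - t0) << " seconds" << endl;
}
```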
```cpp
#include <iostream>
#include <cmath>
#include <omp.h>
using namespace std;

double f(double a)
{
   return( 2.0 / sqrt(1 - a*a) );
}

int main(int argc, char *argv[])
{
   int N;
   double sum;   // Shared variable !
   double w;

   N = ...;      // accuracy of the approximation
   w = 1.0/N;

   sum = 0.0;
   #pragma omp parallel
   {
      int i, num_threads;
      double x;
      double mypi;   // Private variable to reduce synchronization

      num_threads = omp_get_num_threads();

      mypi = 0.0;
      for (i = omp_get_thread_num(); i < N; i = i + num_threads)
      {
         x = w*(i + 0.5);
         mypi = mypi + w*f(x);   // No synchronization needed !
      }

      #pragma omp critical
      {
         sum = sum + mypi;       // Synchronize once per thread, outside the loop !
      }
   }
   cout << sum;
}
```
Change OMP_NUM_THREADS and see the difference in performance
The division of labor (splitting the work of a for-loop) in a parallel for-loop can be done automatically in OpenMP through the PARALLEL LOOP construct.
```cpp
#pragma omp for [parameters]
   for-statement          // Parallel Loop
```
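A minimal sketch that makes the automatic division of labor visible, by printing which thread executes which iteration:

```cpp
#include <iostream>
#include <omp.h>
using namespace std;

int main(int argc, char *argv[])
{
   #pragma omp parallel
   {
      #pragma omp for
      for (int i = 0; i < 16; i++)    // the 16 iterations are split among the threads
      {
         #pragma omp critical         // serialize the output so lines do not interleave
         {
            cout << "iteration " << i
                 << " done by thread " << omp_get_thread_num() << endl;
         }
      }
   }
}
```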
```cpp
#include <iostream>
#include <cmath>
#include <omp.h>
using namespace std;

double f(double a)
{
   return( 2.0 / sqrt(1 - a*a) );
}

int main(int argc, char *argv[])
{
   int N;
   double sum;   // Shared variable, updated !
   double w;

   N = ...;      // accuracy of the approximation
   w = 1.0/N;

   sum = 0.0;
   #pragma omp parallel
   {
      int i;
      double x;
      double mypi;   // Non-shared within the parallel section

      mypi = 0.0;

      /* --------------------------
         PARALLEL FOR construct
         -------------------------- */
      #pragma omp for
      for (i = 0; i < N; i = i + 1)
      {
         x = w*(i + 0.5);          // Saves us the trouble of dividing
         mypi = mypi + w*f(x);     // the work up ourselves...
      }

      #pragma omp critical
      {
         sum = sum + mypi;
      }
   }
   cout << sum;
}
```
Comment: the `#pragma omp for` construct divides the iterations of the loop among the threads automatically, so we no longer have to compute each thread's start and stop indices ourselves.
```
export OMP_NUM_THREADS=8
a.out 50000000
```
Change OMP_NUM_THREADS and see the difference in performance
To set the per-thread stack size (csh syntax):

```
setenv STACKSIZE nBytes
```
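Why this matters: non-shared variables declared inside a parallel region live on each thread's private stack. A sketch (the array size is just an assumed example) of a case where the default stack may be too small:

```cpp
#include <omp.h>

int main(int argc, char *argv[])
{
   #pragma omp parallel
   {
      // Each thread gets its own copy of this array on its private stack;
      // with many threads or bigger arrays, the default stack may be too small.
      double scratch[100000];              // roughly 800 KB per thread

      for (int i = 0; i < 100000; i++)
         scratch[i] = i * 0.5;             // touch the memory so it is really used
   }
}
```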