|
Example:
|
|
Example:
C program statement Machine instructions ===================== ====================== x = x + 1 --> ldr R0, x add R0, R0, #1 str R0,x |
2 threads execute: x = x + 1 Suppose x = 4 Thread 1 Thread 2 ============ =========== ldr R0,x (Thread 1's R0 = 4) ldr R0,x (Thread 2's R0 = 4) add R0,R0,#1 (Thread 1's R0 = 5) add R0,R0,#1 (Thread 2's R0 = 5) str R0,x (Thread 1 sets x = 5) str R0,x (Thread 2 sets x = 5) Result: x = 5 instead of the expected 6 (one of the two updates is lost) |
Example CUDA program that illustrates the above effect:
/*
 * addOne: every launched thread prints its thread index and then increments
 * the int that x points to with a plain (non-atomic) read-modify-write.
 *
 * NOTE: the data race on *x is INTENTIONAL. This program exists to show that
 * concurrent "load, add, store" sequences can overwrite each other, so the
 * final value may be smaller than the number of threads.
 */
__global__ void addOne(int *x)
{
    printf("Thread %d: Adding 1 to x\n", threadIdx.x);
    *x = *x + 1;   // unsynchronized update: increments can be lost
}

/*
 * Host driver: allocates one int in managed memory, zeroes it, launches
 * addOne with 1 block of 4 threads, waits for the kernel, and prints the
 * value before and after.
 *
 * Returns 0 on success, 1 if any CUDA call reports an error.
 */
int main(int argc, char *argv[])
{
    int *x = NULL;
    cudaError_t err;

    /* ====================================
       Allocate x in managed memory so that both the host code below
       and the kernel can dereference the same pointer
       ==================================== */
    err = cudaMallocManaged(&x, sizeof(int));
    if (err != cudaSuccess)
    {
        fprintf(stderr, "cudaMallocManaged failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    *x = 0;                                          // Set *x = 0
    printf("Main before addOne(): *x = %d\n", *x);   // Show *x before calling addOne( )

    // ==================================================================
    // Run kernel on the GPU using 1 block, 4 threads per block
    addOne<<<1, 4>>>(x);

    // A kernel launch returns no status itself; ask for launch errors explicitly
    err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "addOne launch failed: %s\n", cudaGetErrorString(err));
        cudaFree(x);
        return 1;
    }

    // Wait for GPU to finish before accessing *x on the host
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "addOne execution failed: %s\n", cudaGetErrorString(err));
        cudaFree(x);
        return 1;
    }
    // ==================================================================

    printf("Main after addOne(): *x = %d\n", *x);    // Show *x after calling addOne( )

    cudaFree(x);   // release the managed allocation
    return 0;
}
Program: /home/cs355001/demo/CUDA/5-add-common-var/add-common1.cu cs355@ghost01 (2112)> add-common1 5 Main: x = 0 Thread 0: Adding 1 to x Thread 1: Adding 1 to x Thread 2: Adding 1 to x Thread 3: Adding 1 to x Thread 4: Adding 1 to x Main: x = 1 <---- !!! Only got added 1 !!! |
|
int atomicAdd( int* address, int value ); unsigned int atomicAdd(unsigned int* address, unsigned int val); unsigned long long int atomicAdd(unsigned long long int* address, unsigned long long int val); float atomicAdd(float* address, float val); double atomicAdd(double* address, double val); |
Adds value to the variable *address and returns the old value of the variable
Example:
int *x; atomicSub( x, 1 ): subtracts 1 from *x and returns the old value of *x |
Example:
int *x; atomicExch( x, 1 ): sets *x = 1 and returns the old value of *x |
/*
 * add: each thread reports its thread index, then bumps the int at x by one
 * using atomicAdd, so the read-modify-write of *x is a single indivisible
 * operation and concurrent increments are not lost.
 */
__global__ void add(int *x)
{
    const int self = threadIdx.x;   // index of this thread within its block

    printf("Thread %d: Adding 1 to x\n", self);

    // atomicAdd returns the previous value of *x; it is not needed here.
    (void) atomicAdd(x, 1);
}
/*
 * Host driver for the atomicAdd demo: allocates one int in managed memory,
 * zeroes it, launches add with 1 block of N threads, waits for the kernel,
 * and prints the value before and after.
 *
 * NOTE(review): N is not defined in this snippet. The sample run passes a
 * thread count on the command line, so it is presumably derived from argv
 * elsewhere -- confirm.
 *
 * Returns 0 on success, 1 if any CUDA call reports an error.
 */
int main(int argc, char *argv[])
{
    int *x = NULL;
    cudaError_t err;

    /* ====================================
       Allocate x in managed memory so that both the host code below
       and the kernel can dereference the same pointer
       ==================================== */
    err = cudaMallocManaged(&x, sizeof(int));
    if (err != cudaSuccess)
    {
        fprintf(stderr, "cudaMallocManaged failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    *x = 0;                          // Set *x = 0
    printf("Main: x = %d\n", *x);    // Show x before calling add( )

    // ==================================================================
    // Run kernel on the GPU using 1 block, N threads per block
    add<<<1, N>>>(x);

    // A kernel launch returns no status itself; ask for launch errors explicitly
    // (for example, an invalid thread count)
    err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "add launch failed: %s\n", cudaGetErrorString(err));
        cudaFree(x);
        return 1;
    }

    // Wait for GPU to finish before accessing *x on the host
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "add execution failed: %s\n", cudaGetErrorString(err));
        cudaFree(x);
        return 1;
    }
    // ==================================================================

    printf("Main: x = %d\n", *x);    // Show x after calling add( )

    cudaFree(x);   // release the managed allocation
    return 0;
}
|
Program: /home/cs355001/demo/CUDA/5-add-common-var/add-common2.cu cs355@ghost01 (2118)> add-common2 5 Main: x = 0 Thread 0: Adding 1 to x Thread 1: Adding 1 to x Thread 2: Adding 1 to x Thread 3: Adding 1 to x Thread 4: Adding 1 to x Main: x = 5 <---- correct !!! |