   float *x, *y;

   /* ==========================================
      Allocate arrays to store vector x and y
      ========================================== */
   x = calloc( N, sizeof(float) );
   y = calloc( N, sizeof(float) );

   /* ===============================================
      CPU version of the vector addition algorithm
      =============================================== */
   for (i = 0; i < N; i++)        // Add N elements...
       y[i] = x[i] + y[i];        // Add one element at a time...
How the program code progresses in time:  (figure)
   Program file: /home/cs355001/demo/CUDA/3-add-vector/cpu-add-vector.c

   Compile:      nvcc -g cpu-add-vector.c -o cpu-add-vector

   Sample runs:

      cpu-add-vector 100000
      Elasped time = 611 micro secs

      cpu-add-vector 10000000
      Elasped time = 41079 micro secs
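The listing above does not show how the elapsed time is measured. Below is a minimal sketch of a timed CPU version, assuming gettimeofday() is used for the microsecond timing (the demo's actual timing code may differ):

   #include <stdio.h>
   #include <stdlib.h>
   #include <sys/time.h>

   int main(int argc, char *argv[])
   {
       int N = atoi(argv[1]);                       /* vector length from the command line */

       float *x = (float *) calloc( N, sizeof(float) );
       float *y = (float *) calloc( N, sizeof(float) );

       struct timeval start, end;
       gettimeofday(&start, NULL);                  /* record the start time */

       for (int i = 0; i < N; i++)                  /* CPU vector addition */
           y[i] = x[i] + y[i];

       gettimeofday(&end, NULL);                    /* record the end time */

       long usecs = (end.tv_sec  - start.tv_sec) * 1000000L
                  + (end.tv_usec - start.tv_usec);
       printf("Elapsed time = %ld micro secs\n", usecs);

       free(x);
       free(y);
       return 0;
   }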
   /* ==================================================
      "Serial" version of the vector addition algorithm
      ================================================== */
   for (i = 0; i < N; i++)
       y[i] = x[i] + y[i];        // Add 1 element at a time
   __global__ void add(int n, float *x, float *y)
   {
       for (int i = 0; i < n; i++)      // Same code as CPU version
           y[i] = x[i] + y[i];
   }

   int main(int argc, char *argv[])
   {
       ... initialization code omitted ...

       // =======================================================================
       // Run the "add" kernel on the GPU using 1 block, 1 thread per block
       add<<<1, 1>>>(N, x, y);          // The CPU (host) launches the GPU kernel here

       // Wait for GPU to finish before accessing the result on the host
       cudaDeviceSynchronize();
       // =======================================================================

       ... print code omitted ...
   }
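The omitted initialization code is not shown. A common way to set it up in a single-source CUDA program is to allocate x and y in unified memory with cudaMallocManaged(), so both the host code and the kernel can use the same pointers. The fragment below is a sketch under that assumption; the value of N and the initial vector values are made up for illustration:

   // Hypothetical sketch of the omitted initialization (inside main)
   int   N = 10000000;                           // vector length (example value)
   float *x, *y;

   cudaMallocManaged( &x, N * sizeof(float) );   // memory accessible from host and device
   cudaMallocManaged( &y, N * sizeof(float) );

   for (int i = 0; i < N; i++)                   // initialize the input vectors on the host
   {
       x[i] = 1.0f;
       y[i] = 2.0f;
   }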
How the program code progresses in time:  (figure)
   Program file: /home/cs355001/demo/CUDA/3-add-vector/add-vector1.cu

   Compile:      nvcc -o add-vector1 add-vector1.cu

   Sample runs:

      cs355@ghost01 (1457)> add-vector1 100000
      Elasped time = 18492 micro secs

      cs355@ghost01 (1458)> add-vector1 10000000
      Elasped time = 1604292 micro secs
Conclusion: the single-threaded GPU version is much slower than the CPU version (1604292 vs. 41079 micro secs for N = 10000000). A single GPU thread runs on a core that is much slower than a CPU core, so the GPU only pays off when the work is spread over many threads running in parallel, which is what the next version does.
   int main(int argc, char *argv[])
   {
       ... initialization code omitted ...

       // ==================================================================
       // Run the "add" kernel on the GPU using 1 block, K threads per block
       add<<<1, K>>>(N, x, y);          // Spawn K GPU threads

       // Wait for GPU to finish before accessing the result on the host
       cudaDeviceSynchronize();
       // ==================================================================

       ... print code omitted ...
   }
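The kernel used by this version is not shown above. With a single block of K threads, each thread must add a subset of the N elements; the sketch below assumes the usual strided-loop pattern (the demo's actual kernel may differ):

   __global__ void add(int n, float *x, float *y)
   {
       int index  = threadIdx.x;      // this thread's index within the block: 0 .. K-1
       int stride = blockDim.x;       // number of threads in the block: K

       for (int i = index; i < n; i += stride)   // each thread adds every K-th element
           y[i] = x[i] + y[i];
   }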
   Program file: /home/cs355001/demo/CUDA/3-add-vector/vector-add2.cu

   Compile:      nvcc -o add-vector2 add-vector2.cu

   Sample runs:

      cs355@ghost01 (1475)> add-vector2 10000000 1          (1 thread)
      Elasped time = 2106604 micro secs

      cs355@ghost01 (1479)> add-vector2 10000000 10         (10 threads !!)
      Elasped time = 328381 micro secs

      cs355@ghost01 (1480)> add-vector2 10000000 100
      Elasped time = 56360 micro secs
   __global__ void add(int n, float *x, float *y)
   {
       int i = blockIdx.x*blockDim.x + threadIdx.x;

       if ( i < n )                // NO LOOP !!!
       {
           y[i] = x[i] + y[i];     // Work load for thread i !!!
       }
   }
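The launch for this kernel must create enough blocks so that numBlocks * K threads cover all N elements (the sample runs below report the block count that was used). A sketch of how main() could set up the launch, assuming the usual round-up computation (the demo's actual code may differ):

   int K         = atoi(argv[2]);          // threads per block, from the command line
   int numBlocks = (N + K - 1) / K;        // round up so that numBlocks * K >= N

   add<<<numBlocks, K>>>(N, x, y);         // many blocks, K threads per block

   cudaDeviceSynchronize();                // wait for the GPU to finish

The if ( i < n ) test in the kernel makes the extra threads in the last block do nothing when N is not an exact multiple of K.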
   Program file: /home/cs355001/demo/CUDA/3-add-vector/vector-add3.cu

   Compile:      nvcc -o add-vector3 add-vector3.cu

   Sample runs:

      cs355@ghost01 (1475)> add-vector3 10000000 1          (1 thread/blk)
      User specified to use K=1 threads/block
      N = 10000000/1 = 10000000.000000 ---> we must use 10000000 blocks
      Elasped time = 92493 micro secs

      cs355@ghost01 (1751)> add-vector3 10000000 32         (32 threads/blk)
      User specified to use K=32 threads/block
      N = 10000000/32 = 312500.000000 ---> we must use 312500 blocks
      Elasped time = 21408 micro secs

      cs355@ghost01 (1752)> add-vector3 10000000 64
      User specified to use K=64 threads/block
      N = 10000000/64 = 156250.000000 ---> we must use 156250 blocks
      Elasped time = 20970 micro secs