|
|
Example:
Input: A = 1 2 3 4 Output: A = 1 3 6 10 |
/*
 * reduce: overwrite a[] in place with its running totals.
 *
 *   before:  a[0]   a[1]        a[2]             ...
 *   after:   a[0]   a[0]+a[1]   a[0]+a[1]+a[2]   ...
 *
 *   a - array to transform (modified in place)
 *   n - number of elements in a; nothing is touched when n <= 1
 */
void reduce(int a[], int n)
{
    int pos = 1;

    /* Element pos-1 already holds the total of a[0..pos-1], so adding it
       to a[pos] extends the total by one element. */
    while (pos < n) {
        a[pos] += a[pos - 1];
        ++pos;
    }
}
|
Program file: /home/cs355001/demo/CUDA/6-reduction/cpu-array-reduce.c cs355@ghost01 (2655)> cpu-array-reduce 10 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 |
Suppose we have an array of 8 elements:
We assign 1 thread (threadID i) to compute the final value of a[i]:
|
/*
 * reduce (version without synchronization): thread myID adds a[myID-1],
 * a[myID-2], a[myID-4], ... a[myID-256] into a[myID], one guarded statement
 * per distance. Each statement reads an element that another thread
 * (thread myID-d) is itself updating, and nothing in the kernel orders those
 * reads against those updates, so the values a thread picks up depend on how
 * far the other threads have progressed.
 *
 * The host code that follows the kernel is cut off in this excerpt.
 */
__global__ void reduce(int *a, int n) { int myID = blockDim.x*blockIdx.x + threadIdx.x; if ( myID >= 1 && myID < n) a[myID] += a[myID-1]; if ( myID >= 2 && myID < n) a[myID] += a[myID-2]; if ( myID >= 4 && myID < n) a[myID] += a[myID-4]; if ( myID >= 8 && myID < n) a[myID] += a[myID-8]; if ( myID >= 16 && myID < n) a[myID] += a[myID-16]; if ( myID >= 32 && myID < n) a[myID] += a[myID-32]; if ( myID >= 64 && myID < n) a[myID] += a[myID-64]; if ( myID >= 128 && myID < n) a[myID] += a[myID-128]; if ( myID >= 256 && myID < n) a[myID] += a[myID-256]; } int main (int argc, char *argv[]) { N = 512; int K = 512; int NBlks = ceil((float) N / K ); reduce<< |
When you run it, you will have problems (the problem does not show up for arrays of fewer than 512 elements -- the kernel must run long enough for the threads to diverge in speed)
Program: /home/cs355001/demo/CUDA/6-reduction/array-reduce1.cu (The input array is A = 1 1 1 1 1 1 ...) The correct output is: 1 2 3 4 5 6 7 .... 512 Sometimes you will get the correct output. But other times you will get a wrong result such as: 1 2 3 ... 474 477 478 479 480 |
|
|
__syncthreads( ); |
causes a thread to pause (wait) until all threads in the same thread block have reached the __syncthreads( ) call
|
/*
 * reduce: in-place running sum of a[0..n-1] computed by one thread block.
 *
 *   before:  a[0]   a[1]        a[2]             ...
 *   after:   a[0]   a[0]+a[1]   a[0]+a[1]+a[2]   ...
 *
 *   a - device array, modified in place
 *   n - number of elements in a
 *
 * Launch requirements: a single block with at least n threads
 * (gridDim.x == 1, blockDim.x >= n).  __syncthreads( ) only waits for
 * threads of the same block, so elements handled by different blocks
 * would not be ordered against each other.
 *
 * Each step adds the element `stride` positions to the left, with stride
 * = 1, 2, 4, ... while stride < n.  A step is split into a read phase and
 * a write phase, each ended by a barrier: thread myID reads a[myID-stride]
 * while thread myID-stride wants to update that same element, so every
 * read of the step has to complete before any write of the step begins.
 */
__global__ void reduce(int *a, int n)
{
    int myID = blockDim.x*blockIdx.x + threadIdx.x;

    /* The loop bound depends only on n, so every thread of the block runs
       the same number of iterations and reaches every barrier. */
    for (int stride = 1; stride < n; stride *= 2)
    {
        int active = (myID >= stride && myID < n);
        int left   = 0;

        if (active)
            left = a[myID - stride];   /* read phase: value from before this step */

        __syncthreads( );              /* all reads of this step are done */

        if (active)
            a[myID] += left;           /* write phase */

        __syncthreads( );              /* all writes are visible before the next step */
    }
}
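Program: /home/cs355001/demo/CUDA/6-reduction/array-reduce2.cu (The input array is A = 1 1 1 1 1 1 ... Now you will always get the correct output: 1 2 3 4 5 6 7 .... 512, provided the whole array is handled by a single thread block -- __syncthreads( ) does not synchronize threads in different blocks) |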