|
|
Example:
Input:
A = 1 2 3 4
Output:
A = 1 3 6 10
|
/* ------------------------------------------------
   reduce (sequential version): in-place running sum

   On entry:  a[0] a[1] a[2] ... a[n-1]
   On exit:   a[0] = a[0]
              a[1] = a[0]+a[1]
              a[2] = a[0]+a[1]+a[2]
              ...

   a - array that is overwritten with its running sums
   n - number of elements in a (nothing happens when n <= 1)
   ------------------------------------------------ */
void reduce(int a[], int n)
{
   int pos = 1;                /* a[0] already holds its final value */

   while (pos < n)
   {
      /* a[pos-1] is final by now, so adding it completes a[pos] */
      a[pos] += a[pos - 1];
      ++pos;
   }
}
|
|
Program file: /home/cs355001/demo/CUDA/6-reduction/cpu-array-reduce.c
Sample run (the first line printed is the input array, the second line is the result):
cs355@ghost01 (2655)> cpu-array-reduce 10
1 1 1 1 1 1 1 1 1 1
1 2 3 4 5 6 7 8 9 10
|
Suppose we have an array of 8 elements:
|
We assign 1 thread (threadID i) to compute the final value of a[i]:
|
|
/* ------------------------------------------------
   reduce (first GPU attempt -- contains a race condition)

   Thread myID updates a[myID] in place by adding the element
   1, 2, 4, ..., 256 positions to its left, one offset per step.
   The largest offset is 256, so the steps written here cover
   arrays of at most 512 elements.

   a - array that is updated in place
   n - number of elements in a

   NOTE: there is no synchronization between the steps.  Each step
   reads a[myID-k], an element that another thread is updating, so
   the value that is read depends on how far that other thread has
   progressed.  The result is therefore not reliable.
   ------------------------------------------------ */
__global__ void reduce(int *a, int n)
{
/* global thread index = index of the array element this thread updates */
int myID = blockDim.x*blockIdx.x + threadIdx.x;
/* step with offset 1 (element 0 has no left neighbour) */
if ( myID >= 1 && myID < n)
a[myID] += a[myID-1];
/* step with offset 2 -- nothing guarantees a[myID-2] finished the previous step */
if ( myID >= 2 && myID < n)
a[myID] += a[myID-2];
/* step with offset 4 */
if ( myID >= 4 && myID < n)
a[myID] += a[myID-4];
/* step with offset 8 */
if ( myID >= 8 && myID < n)
a[myID] += a[myID-8];
/* step with offset 16 */
if ( myID >= 16 && myID < n)
a[myID] += a[myID-16];
/* step with offset 32 */
if ( myID >= 32 && myID < n)
a[myID] += a[myID-32];
/* step with offset 64 */
if ( myID >= 64 && myID < n)
a[myID] += a[myID-64];
/* step with offset 128 */
if ( myID >= 128 && myID < n)
a[myID] += a[myID-128];
/* step with offset 256 (last step written out here) */
if ( myID >= 256 && myID < n)
a[myID] += a[myID-256];
}
/* Host-side driver for the reduce kernel (only a fragment is shown here). */
int main (int argc, char *argv[])
{
/* NOTE(review): N is assigned without a visible declaration in this excerpt */
N = 512;
/* presumably the number of threads per block -- confirm against the full launch statement */
int K = 512;
/* N/K rounded up (1 when N = 512 and K = 512) */
int NBlks = ceil((float) N / K );
/* NOTE(review): the kernel launch below is cut off in this excerpt */
reduce<<
|
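When you run it, you will sometimes get a wrong result: the threads do not run in lockstep, so a thread may read a[myID-k] before the thread that owns that element has finished its previous step. (The problem does not show up for arrays of fewer than 512 elements -- the kernel must run long enough for the threads to diverge in speed.)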
Program: /home/cs355001/demo/CUDA/6-reduction/array-reduce1.cu
The input is the array A = 1 1 1 1 1 1 ...
The correct output is: 1 2 3 4 5 6 7 .... 512
Sometimes you will get the correct output.
But other times you will get a wrong output such as:
1 2 3 ... 474 477 478 479 480
|
|
|
__syncthreads( );
|
causes a thread to pause (wait) until all threads in the same thread block have reached the __syncthreads( ) call. It does not synchronize threads that belong to different thread blocks.
|
/* ------------------------------------------------
   reduce (GPU version with barriers): in-place running sum

   input:  a[0] a[1] a[2] ...
   output: a[0] = a[0]
           a[1] = a[0]+a[1]
           a[2] = a[0]+a[1]+a[2]
           ...

   Thread myID computes the final value of a[myID] by adding the
   element d positions to its left, for d = 1, 2, 4, 8, ...

   a - array (in device memory) that is updated in place
   n - number of elements in a

   Launch requirement: ONE thread block with at least n threads.
   __syncthreads( ) only waits for the threads of the same block,
   so the steps are not ordered correctly across several blocks.
   ------------------------------------------------ */
__global__ void reduce(int *a, int n)
{
   int myID = blockDim.x*blockIdx.x + threadIdx.x;
   int d;

   /* The loop bound depends only on n, so every thread of the block
      executes the same number of __syncthreads( ) calls */
   for (d = 1; d < n; d *= 2)
   {
      int left = 0;

      /* Read phase: fetch the left neighbour BEFORE anyone overwrites it */
      if ( myID >= d && myID < n)
         left = a[myID-d];

      __syncthreads( );          /* all reads of this step are done */

      /* Write phase: now it is safe to update a[myID] */
      if ( myID >= d && myID < n)
         a[myID] += left;

      __syncthreads( );          /* all writes are done before the next step reads */
   }
}
|
Program: /home/cs355001/demo/CUDA/6-reduction/array-reduce2.cu
(The input is the array A = 1 1 1 1 1 1 ... Now you will always get the correct output: 1 2 3 4 5 6 7 .... 512) |