- The main( ) function
that launches the
vectorAdd( ) kernel:
// Host driver for the vectorAdd kernel: allocates the host and device
// arrays, copies the two input arrays to the device, launches the kernel,
// copies the result array back to the host, and releases all memory.
//
// NOTE: the initializers of N and T below are placeholders from the
// original text and must be replaced with concrete values before compiling.
// NOTE: the return codes of the CUDA runtime calls are not checked here.
int main(int argc, char *argv[])
{
    int N = Vector size
    size_t bytes = N * sizeof(float);   // Size of each array in bytes

    float *x, *y, *z;                   // Host arrays
    x = (float *)malloc(bytes);         // Allocate host arrays
    y = (float *)malloc(bytes);
    z = (float *)malloc(bytes);
    // NOTE: x and y are not filled in here; initialize them before the copy.

    float *d_x, *d_y, *d_z;             // Device arrays
    cudaMalloc(&d_x, bytes);            // Allocate device arrays
    cudaMalloc(&d_y, bytes);
    cudaMalloc(&d_z, bytes);

    // Transfer data host memory ---> device memory
    cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, y, bytes, cudaMemcpyHostToDevice);

    int T = # threads per thread block (user choice)
    // # thread blocks needed: integer ceiling of N / T.
    // Integer arithmetic is used because (float) N is only exact up to 2^24,
    // so ceil((float) N / T) can yield one block too few for large N.
    int B = (N + T - 1) / T;

    vectorAdd<<<B, T>>>(d_x, d_y, d_z, N);   // Launch kernel

    // Transfer data host memory <--- device memory
    cudaMemcpy(z, d_z, bytes, cudaMemcpyDeviceToHost);

    // Release device arrays, then host arrays
    cudaFree(d_x);
    cudaFree(d_y);
    cudaFree(d_z);
    free(x);
    free(y);
    free(z);

    return 0;
}
|
|