CUDA programming before the introduction of Unified Memory

  • For completeness' sake, I would like to discuss how to write CUDA programs without using Unified Memory

  • Recall: the host memory and device memory are separate memory systems.

  • CUDA programs before the introduction of Unified Memory:

    • Use the host function cudaMemcpy( ) to transfer data between CPU (host) memory and GPU (device) memory

Allocating host (CPU) memory and device (GPU) memory

  • Before a program can perform computations:

    • it must first allocate memory to store its data in variables


  • How to allocate variables in the host (CPU) memory:

      dataType *x = (dataType *) malloc( NBytes ); // CS255 material 
    
      Example: allocate an array of 1000 doubles in CPU memory:
    
        double *x = (double *) malloc( 1000*sizeof(double) );
    

  • How to allocate variables in the device (GPU) memory:

      dataType *d_x;   // Device variables are prefixed with d_
    
      cudaMalloc( &d_x, NBytes );
    
      Example: allocate an array of 1000 doubles in GPU memory:
    
        double *d_x;
        cudaMalloc( &d_x, 1000*sizeof(double) );
    
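  • Note: cudaMalloc( ) (like most CUDA library functions) returns a cudaError_t
    status code. The examples in these notes omit error checking for brevity;
    a minimal check would look like this:

      cudaError_t err = cudaMalloc( &d_x, 1000*sizeof(double) );

      if ( err != cudaSuccess )
          printf("cudaMalloc failed: %s\n", cudaGetErrorString(err));
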

Transfer data between host (CPU) memory and device (GPU) memory

  • The cudaMemcpy( ) function:

     cudaMemcpy(to, from, Nbytes, MODE):
    
     if MODE = cudaMemcpyHostToDevice:
    
       transfers Nbytes from host memory to device memory
    
     if MODE = cudaMemcpyDeviceToHost:
    
       transfers Nbytes from device memory to host memory
    
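  • Example: a minimal round trip (a toy sketch, not taken from the demo
    programs) that copies a host array to the device and back using both modes:

      float a[4] = {1, 2, 3, 4}, b[4];
      float *d_a;

      cudaMalloc( &d_a, 4*sizeof(float) );
      cudaMemcpy(d_a, a, 4*sizeof(float), cudaMemcpyHostToDevice); // host ---> device
      cudaMemcpy(b, d_a, 4*sizeof(float), cudaMemcpyDeviceToHost); // device ---> host
      cudaFree(d_a);   // b[ ] now contains a copy of a[ ]
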

  • How to use cudaMemcpy( ) with GPU kernel functions:

      (1) Transfer input data from host memory to device memory:
    
          cudaMemcpy(d_x, x, N*sizeof(float), cudaMemcpyHostToDevice);
    
      (2) Execute the kernel function on the device data:
    
          vectorAdd<<<B, T>>>(d_x, d_y, d_z, N);
    
      (3) Transfer output data from device memory to host memory:
    
          cudaMemcpy(z, d_z, N*sizeof(float), cudaMemcpyDeviceToHost);
    
    
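  • For reference, the vectorAdd( ) kernel used in step (2) has the following
    general shape (a sketch inferred from the launch arguments; the actual
    kernel is defined in the demo program):

      __global__ void vectorAdd(float *x, float *y, float *z, int N)
      {
          int i = blockIdx.x*blockDim.x + threadIdx.x;  // Global thread index

          if ( i < N )              // Guard: B*T may exceed N in the last block
              z[i] = x[i] + y[i];   // Each thread adds one pair of elements
      }
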

The host program of Vector Addition without using Unified Memory

  • The main( ) function that launches the vectorAdd( ) kernel:

    int main(int argc, char *argv[])
    {
      int N = ...;               // Vector size

      float *x, *y, *z;          // Host arrays

      x = (float *)malloc(N*sizeof(float)); // Allocate host arrays
      y = (float *)malloc(N*sizeof(float));
      z = (float *)malloc(N*sizeof(float));

      // (Assume that x[ ] and y[ ] are initialized here)

      float *d_x, *d_y, *d_z;    // Device arrays

      cudaMalloc(&d_x, N*sizeof(float));    // Allocate device arrays
      cudaMalloc(&d_y, N*sizeof(float));
      cudaMalloc(&d_z, N*sizeof(float));

      // Transfer data  host memory ---> device memory
      cudaMemcpy(d_x, x, N*sizeof(float), cudaMemcpyHostToDevice);
      cudaMemcpy(d_y, y, N*sizeof(float), cudaMemcpyHostToDevice);

      int T = ...;               // # threads per thread block (user choice)
      int B = ceil( (float) N / T );         // # thread blocks needed
      vectorAdd<<<B, T>>>(d_x, d_y, d_z, N); // Launch kernel
      cudaDeviceSynchronize();   // Wait for the kernel to finish

      // Transfer data  host memory <--- device memory
      cudaMemcpy(z, d_z, N*sizeof(float), cudaMemcpyDeviceToHost);

      free(x); free(y); free(z);                   // Release host arrays
      cudaFree(d_x); cudaFree(d_y); cudaFree(d_z); // Release device arrays
    }
    
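
  • After the last transfer, the host can verify the result, e.g. (a simple
    sketch, not part of the listing above):

      for (int i = 0; i < N; i++)
          if ( z[i] != x[i] + y[i] )
              printf("Error in element %d\n", i);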

The host program of Vector Addition without using Unified Memory

  • The same main( ) function, one step at a time:

    (1) Allocate the host array variables to store the vectors:

          float *x, *y, *z;          // Host arrays

          x = (float *)malloc(N*sizeof(float)); // Allocate host arrays
          y = (float *)malloc(N*sizeof(float));
          z = (float *)malloc(N*sizeof(float));

        (Assume that the arrays have been initialized...)

    (2) Allocate the device array variables to store the vectors:

          float *d_x, *d_y, *d_z;    // Device arrays

          cudaMalloc(&d_x, N*sizeof(float));    // Allocate device arrays
          cudaMalloc(&d_y, N*sizeof(float));
          cudaMalloc(&d_z, N*sizeof(float));

    (3) Transfer the input data from the host (CPU) variables to the device (GPU) variables:

          // Transfer data  host memory ---> device memory
          cudaMemcpy(d_x, x, N*sizeof(float), cudaMemcpyHostToDevice);
          cudaMemcpy(d_y, y, N*sizeof(float), cudaMemcpyHostToDevice);

    (4) Launch the vectorAdd( ) kernel function on the device (GPU) variables:

          int T = ...;               // # threads per thread block (user choice)
          int B = ceil( (float) N / T );         // # thread blocks needed
          vectorAdd<<<B, T>>>(d_x, d_y, d_z, N); // Launch kernel
          cudaDeviceSynchronize();   // Wait for the kernel to finish

    (5) Transfer the result from the device (GPU) variable to the host (CPU) variable:

          // Transfer data  host memory <--- device memory
          cudaMemcpy(z, d_z, N*sizeof(float), cudaMemcpyDeviceToHost);


DEMO: demo/CUDA/3-add-vector/gpu-add-vector2.cu
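
  • The demo can be compiled with NVIDIA's nvcc compiler (assuming a standard
    CUDA toolkit installation):

      nvcc -o gpu-add-vector2 gpu-add-vector2.cu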