Recall:   The kinds of variables that can be managed by the Unified Memory subsystem

The Unified Memory technique can manage the following kinds of variables:

  1. Global variables qualified with __managed__

    Examples:

        __managed__ int x;            // In last example
      
        __managed__ float A[100];     // one-dim array
      
        __managed__ double B[10][10]; // 2-dim array    
      

  2. (Dynamically) allocated variables

      • The CUDA library function cudaMallocManaged( ) can (dynamically) allocate variables that are managed by the Unified Memory method

The cudaMallocManaged( ) library function (CUDA doc)
 

  • Syntax:

        dataTypeX   *A;
        int          N;
      
        cudaMallocManaged( &A,  N*sizeof( dataTypeX ) );  
      

    will:

      1. Create (= allocate memory for) an array of type dataTypeX of size N

      2. Store the base address of the allocated array in the (reference) variable A

I will show you an example of how to use cudaMallocManaged( ) next.

Example: sharing a cudaMallocManaged array between CPU and GPU

Recall the example: sharing a global array between the CPU and the GPU:

__managed__ int x[10]; // Defines global shared array variable

__global__ void GPU_func( )
{
   printf("++ GPU sees x: ");
   for (int i = 0; i < 10; i++ ) printf("%d ", x[i]);
   printf("\n");

   for (int i = 0; i < 10; i++ ) x[i]+=i; // Update x[i]
}

int main()
{
   for (int i = 0; i < 10; i++ ) x[i] = 1000+i;

   GPU_func<<< 1, 1 >>>( );
   cudaDeviceSynchronize();

   printf("** CPU sees x: ");
   for (int i = 0; i < 10; i++ ) printf("%d ", x[i]);
   printf("\n");
} 

DEMO: /home/cs355001/demo/CUDA/2-unified-mem/shared-global-array.cu

Example: sharing a cudaMallocManaged array between CPU and GPU

I will turn the __managed__ global array into a cudaMallocManaged array:

__managed__ int x[10]; // Defines global shared array variable

__global__ void GPU_func( )
{
   printf("++ GPU sees x: ");
   for (int i = 0; i < 10; i++ ) printf("%d ", x[i]);
   printf("\n");

   for (int i = 0; i < 10; i++ ) x[i]+=i; // Update x[i]
}

int main()
{
   for (int i = 0; i < 10; i++ ) x[i] = 1000+i;

   GPU_func<<< 1, 1 >>>( );
   cudaDeviceSynchronize();

   printf("** CPU sees x: ");
   for (int i = 0; i < 10; i++ ) printf("%d ", x[i]);
   printf("\n");
} 

Example: sharing a cudaMallocManaged array between CPU and GPU

We do not use a global array variable - so delete it:

__global__ void GPU_func( )
{
   printf("++ GPU sees x: ");
   for (int i = 0; i < 10; i++ ) printf("%d ", x[i]);
   printf("\n");

   for (int i = 0; i < 10; i++ ) x[i]+=i; // Update x[i]
}

int main()
{
   for (int i = 0; i < 10; i++ ) x[i] = 1000+i;

   GPU_func<<< 1, 1 >>>( );
   cudaDeviceSynchronize();

   printf("** CPU sees x: ");
   for (int i = 0; i < 10; i++ ) printf("%d ", x[i]);
   printf("\n");
} 

Example: sharing a cudaMallocManaged array between CPU and GPU

We create a cudaMallocManaged array variable - I used the same array name (x) !!:

__global__ void GPU_func( )
{
   printf("++ GPU sees x: ");
   for (int i = 0; i < 10; i++ ) printf("%d ", x[i]);
   printf("\n");

   for (int i = 0; i < 10; i++ ) x[i]+=i; // Update x[i]
}

int main()
{
   int   *x;                                   
   cudaMallocManaged( &x,  10*sizeof( int ) ); 

   for (int i = 0; i < 10; i++ ) x[i] = 1000+i;

   GPU_func<<< 1, 1 >>>( );
   cudaDeviceSynchronize();

   printf("** CPU sees x: ");
   for (int i = 0; i < 10; i++ ) printf("%d ", x[i]);
   printf("\n");
} 

Example: sharing a cudaMallocManaged array between CPU and GPU

We pass the array x and the array size (10) as parameters to the kernel call:

__global__ void GPU_func( )
{
   printf("++ GPU sees x: ");
   for (int i = 0; i < 10; i++ ) printf("%d ", x[i]);
   printf("\n");

   for (int i = 0; i < 10; i++ ) x[i]+=i; // Update x[i]
}

int main()
{
   int   *x;     
   cudaMallocManaged( &x,  10*sizeof( int ) ); 

   for (int i = 0; i < 10; i++ ) x[i] = 1000+i;

   GPU_func<<< 1, 1 >>>( x, 10 );
   cudaDeviceSynchronize();

   printf("** CPU sees x: ");
   for (int i = 0; i < 10; i++ ) printf("%d ", x[i]);
   printf("\n");
} 

Example: sharing a cudaMallocManaged array between CPU and GPU

We will need to rewrite the kernel function with the parameter variables:

__global__ void GPU_func(int A[ ], int N)
{
   printf("++ GPU sees x: ");
   for (int i = 0; i < 10; i++ ) printf("%d ", x[i]);
   printf("\n");

   for (int i = 0; i < 10; i++ ) x[i]+=i; // Update x[i]
}

int main()
{
   int   *x;     
   cudaMallocManaged( &x,  10*sizeof( int ) ); 

   for (int i = 0; i < 10; i++ ) x[i] = 1000+i;

   GPU_func<<< 1, 1 >>>( x, 10 );
   cudaDeviceSynchronize();

   printf("** CPU sees x: ");
   for (int i = 0; i < 10; i++ ) printf("%d ", x[i]);
   printf("\n");
} 

Example: sharing a cudaMallocManaged array between CPU and GPU

The array name was changed to A:

__global__ void GPU_func(int A[ ], int N)
{
   printf("++ GPU sees A: ");
   for (int i = 0; i < 10; i++ ) printf("%d ", A[i]);
   printf("\n");

   for (int i = 0; i < 10; i++ ) A[i]+=i; // Update A[i]
}

int main()
{
   int   *x;     
   cudaMallocManaged( &x,  10*sizeof( int ) ); 

   for (int i = 0; i < 10; i++ ) x[i] = 1000+i;

   GPU_func<<< 1, 1 >>>( x, 10 );
   cudaDeviceSynchronize();

   printf("** CPU sees x: ");
   for (int i = 0; i < 10; i++ ) printf("%d ", x[i]);
   printf("\n");
} 

Example: sharing a cudaMallocManaged array between CPU and GPU

The array size was changed to N:

__global__ void GPU_func(int A[ ], int N)
{
   printf("++ GPU sees A: ");
   for (int i = 0; i < N; i++ ) printf("%d ", A[i]);
   printf("\n");

   for (int i = 0; i < N; i++ ) A[i]+=i; // Update A[i]
}

int main()
{
   int   *x;     
   cudaMallocManaged( &x,  10*sizeof( int ) ); 

   for (int i = 0; i < 10; i++ ) x[i] = 1000+i;

   GPU_func<<< 1, 1 >>>( x, 10 );
   cudaDeviceSynchronize();

   printf("** CPU sees x: ");
   for (int i = 0; i < 10; i++ ) printf("%d ", x[i]);
   printf("\n");
} 

DEMO: /home/cs355001/demo/CUDA/2-unified-mem/shared-malloc-array.cu

Declaring a __global__ kernel function

Recall that C requires a function to be defined or declared before it is used:

__global__ void GPU_func(int A[ ], int N) // Define function
{
   printf("++ GPU sees A: ");
   for (int i = 0; i < N; i++ ) printf("%d ", A[i]);
   printf("\n");

   for (int i = 0; i < N; i++ ) A[i]+=i; // Update A[i]
}

int main()
{
   int   *x;     
   cudaMallocManaged( &x,  10*sizeof( int ) ); 

   for (int i = 0; i < 10; i++ ) x[i] = 1000+i;

   GPU_func<<< 1, 1 >>>( x, 10 ); // Use a function
   cudaDeviceSynchronize();

   printf("** CPU sees x: ");
   for (int i = 0; i < 10; i++ ) printf("%d ", x[i]);
   printf("\n");
} 

Declaring a __global__ kernel function

We get a compile error if a function is defined after it was used:



int main()
{
   int   *x;     
   cudaMallocManaged( &x,  10*sizeof( int ) ); 

   for (int i = 0; i < 10; i++ ) x[i] = 1000+i;

   GPU_func<<< 1, 1 >>>( x, 10 ); // Use a function
   cudaDeviceSynchronize();

   printf("** CPU sees x: ");
   for (int i = 0; i < 10; i++ ) printf("%d ", x[i]);
   printf("\n");
}

__global__ void GPU_func(int A[ ], int N) // Define function
{
   printf("++ GPU sees A: ");
   for (int i = 0; i < N; i++ ) printf("%d ", A[i]);
   printf("\n");

   for (int i = 0; i < N; i++ ) A[i]+=i; // Update A[i]
}

Declaring a __global__ kernel function

We can use the following CUDA C syntax to declare a __global__ function:

__global__ void GPU_func(int A[ ], int N); // Declaration    

int main()
{
   int   *x;     
   cudaMallocManaged( &x,  10*sizeof( int ) ); 

   for (int i = 0; i < 10; i++ ) x[i] = 1000+i;

   GPU_func<<< 1, 1 >>>( x, 10 ); // Use a function
   cudaDeviceSynchronize();

   printf("** CPU sees x: ");
   for (int i = 0; i < 10; i++ ) printf("%d ", x[i]);
   printf("\n");
}

__global__ void GPU_func(int A[ ], int N) // Define function
{
   printf("++ GPU sees A: ");
   for (int i = 0; i < N; i++ ) printf("%d ", A[i]);
   printf("\n");

   for (int i = 0; i < N; i++ ) A[i]+=i; // Update A[i]
}

DEMO: /home/cs355001/demo/CUDA/2-unified-mem/shared-malloc-array.cu