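The Unified Memory technique can manage the following kinds of variables:
- Global variables defined with the __managed__ qualifier
- Dynamically allocated variables (arrays) created with cudaMallocManaged( )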
I will show you an example of how to use cudaMallocManaged( ) next.
Recall the example: sharing a global array between the CPU and the GPU:
__managed__ int x[10];      // Defines global shared array variable

// Kernel: prints the 10 elements of the global array x as seen by the GPU,
// then adds i to x[i]. The body loops over the whole array itself, so it is
// launched with a single thread (<<< 1, 1 >>>).
__global__ void GPU_func( )
{
    printf("++ GPU sees x: ");
    for (int i = 0; i < 10; i++ )
        printf("%d ", x[i]);
    printf("\n");

    for (int i = 0; i < 10; i++ )
        x[i] += i;          // Update x[i]
}

// Host program: the CPU fills x, the GPU prints and updates it,
// and the CPU prints the updated values.
int main()
{
    for (int i = 0; i < 10; i++ )
        x[i] = 1000+i;

    GPU_func<<< 1, 1 >>>( );
    cudaDeviceSynchronize();    // wait for the kernel before the CPU reads x

    printf("** CPU sees x: ");
    for (int i = 0; i < 10; i++ )
        printf("%d ", x[i]);
    printf("\n");
}
DEMO: /home/cs355001/demo/CUDA/2-unified-mem/shared-global-array.cu
I will turn the __managed__ global array into a cudaMallocManaged array:
// Starting point of the conversion: x is a global array declared with the
// __managed__ qualifier, and both main() (CPU) and GPU_func (GPU) access it by name.
__managed__ int x[10]; // Defines global shared array variable
// Kernel: prints the 10 elements of x as seen by the GPU, then adds i to x[i].
// It loops over the whole array itself and is launched below with <<< 1, 1 >>>.
__global__ void GPU_func( )
{
printf("++ GPU sees x: ");
for (int i = 0; i < 10; i++ ) printf("%d ", x[i]);
printf("\n");
for (int i = 0; i < 10; i++ ) x[i]+=i; // Update x[i]
}
// Host program: CPU initializes x, GPU prints and updates it, CPU prints the result.
int main()
{
// CPU writes the initial values 1000..1009 into x
for (int i = 0; i < 10; i++ ) x[i] = 1000+i;
GPU_func<<< 1, 1 >>>( );
// Wait for the kernel to finish before the CPU reads x again
cudaDeviceSynchronize();
printf("** CPU sees x: ");
for (int i = 0; i < 10; i++ ) printf("%d ", x[i]);
printf("\n");
}
|
We do not use a global array variable - so delete it:
// NOTE: intermediate tutorial step - the global array x has been removed and
// nothing replaces it yet, so x is undeclared here and this does not compile.

// Kernel: prints the 10 elements of x as seen by the GPU, then adds i to x[i].
__global__ void GPU_func( )
{
    printf("++ GPU sees x: ");
    for (int i = 0; i < 10; i++ )
        printf("%d ", x[i]);
    printf("\n");

    for (int i = 0; i < 10; i++ )
        x[i] += i;          // Update x[i]
}

int main()
{
    for (int i = 0; i < 10; i++ )
        x[i] = 1000+i;

    GPU_func<<< 1, 1 >>>( );
    cudaDeviceSynchronize();    // wait for the kernel before the CPU reads x

    printf("** CPU sees x: ");
    for (int i = 0; i < 10; i++ )
        printf("%d ", x[i]);
    printf("\n");
}
We create a cudaMallocManaged array variable - I used the same array name (x) !!:
// NOTE: intermediate tutorial step - x is now a local variable of main(), so
// the kernel below cannot see it yet; this does not compile until x is passed
// to the kernel as a parameter.

// Kernel: prints the 10 elements of x as seen by the GPU, then adds i to x[i].
__global__ void GPU_func( )
{
    printf("++ GPU sees x: ");
    for (int i = 0; i < 10; i++ )
        printf("%d ", x[i]);
    printf("\n");

    for (int i = 0; i < 10; i++ )
        x[i] += i;          // Update x[i]
}

int main()
{
    int *x;
    cudaMallocManaged( &x, 10*sizeof( int ) );  // array of 10 ints in managed memory

    for (int i = 0; i < 10; i++ )
        x[i] = 1000+i;

    GPU_func<<< 1, 1 >>>( );
    cudaDeviceSynchronize();    // wait for the kernel before the CPU reads x

    printf("** CPU sees x: ");
    for (int i = 0; i < 10; i++ )
        printf("%d ", x[i]);
    printf("\n");
}
We pass the array x and the array size (10) as parameters to the kernel call:
// NOTE: intermediate tutorial step - main() now launches the kernel with the
// arguments (x, 10), but GPU_func( ) is still declared without parameters and
// its body still refers to x, which is local to main(). This does not compile
// until the kernel signature is rewritten.
__global__ void GPU_func( )
{
printf("++ GPU sees x: ");
for (int i = 0; i < 10; i++ ) printf("%d ", x[i]);
printf("\n");
for (int i = 0; i < 10; i++ ) x[i]+=i; // Update x[i]
}
int main()
{
int *x;
// Allocate room for 10 ints with cudaMallocManaged; x receives the address
cudaMallocManaged( &x, 10*sizeof( int ) );
// CPU writes the initial values 1000..1009 into x
for (int i = 0; i < 10; i++ ) x[i] = 1000+i;
// Pass the array and its size to the kernel
GPU_func<<< 1, 1 >>>( x, 10 );
// Wait for the kernel to finish before the CPU reads x again
cudaDeviceSynchronize();
printf("** CPU sees x: ");
for (int i = 0; i < 10; i++ ) printf("%d ", x[i]);
printf("\n");
}
|
We will need to rewrite the kernel function with the parameter variables:
// NOTE: intermediate tutorial step - the kernel now receives the array (A) and
// its size (N), but its body still uses the old name x and the constant 10,
// so this does not compile yet.
//   A - array to print and update
//   N - number of elements in A
__global__ void GPU_func(int A[ ], int N)
{
    printf("++ GPU sees x: ");
    for (int i = 0; i < 10; i++ )
        printf("%d ", x[i]);
    printf("\n");

    for (int i = 0; i < 10; i++ )
        x[i] += i;          // Update x[i]
}

int main()
{
    int *x;
    cudaMallocManaged( &x, 10*sizeof( int ) );  // array of 10 ints in managed memory

    for (int i = 0; i < 10; i++ )
        x[i] = 1000+i;

    GPU_func<<< 1, 1 >>>( x, 10 );
    cudaDeviceSynchronize();    // wait for the kernel before the CPU reads x

    printf("** CPU sees x: ");
    for (int i = 0; i < 10; i++ )
        printf("%d ", x[i]);
    printf("\n");
}
The array name was changed to A:
// Kernel: prints the elements of A as seen by the GPU, then adds i to A[i].
// NOTE: intermediate tutorial step - the array is now accessed through the
// parameter A, but the loop bounds are still the constant 10 instead of N.
//   A - array to print and update
//   N - number of elements in A (not used yet)
__global__ void GPU_func(int A[ ], int N)
{
    printf("++ GPU sees A: ");
    for (int i = 0; i < 10; i++ )
        printf("%d ", A[i]);
    printf("\n");

    for (int i = 0; i < 10; i++ )
        A[i] += i;          // Update A[i]
}

int main()
{
    int *x;
    cudaMallocManaged( &x, 10*sizeof( int ) );  // array of 10 ints in managed memory

    for (int i = 0; i < 10; i++ )
        x[i] = 1000+i;

    GPU_func<<< 1, 1 >>>( x, 10 );
    cudaDeviceSynchronize();    // wait for the kernel before the CPU reads x

    printf("** CPU sees x: ");
    for (int i = 0; i < 10; i++ )
        printf("%d ", x[i]);
    printf("\n");
}
The array size was changed to N:
// Kernel: prints the N elements of A as seen by the GPU, then adds i to A[i].
// The body loops over the whole array itself, so it is launched with a
// single thread (<<< 1, 1 >>>).
//   A - array to print and update
//   N - number of elements in A
__global__ void GPU_func(int A[ ], int N)
{
    printf("++ GPU sees A: ");
    for (int i = 0; i < N; i++ )
        printf("%d ", A[i]);
    printf("\n");

    for (int i = 0; i < N; i++ )
        A[i] += i;          // Update A[i]
}

// Host program: allocates a managed array, lets the CPU fill it, lets the GPU
// print and update it, then prints the updated values on the CPU.
int main()
{
    int *x;
    cudaMallocManaged( &x, 10*sizeof( int ) );  // array of 10 ints in managed memory

    for (int i = 0; i < 10; i++ )
        x[i] = 1000+i;

    GPU_func<<< 1, 1 >>>( x, 10 );
    cudaDeviceSynchronize();    // wait for the kernel before the CPU reads x

    printf("** CPU sees x: ");
    for (int i = 0; i < 10; i++ )
        printf("%d ", x[i]);
    printf("\n");
}
DEMO: /home/cs355001/demo/CUDA/2-unified-mem/shared-malloc-array.cu
Recall that C requires a function to be defined or declared before it is used:
// Kernel: prints the N elements of A as seen by the GPU, then adds i to A[i].
//   A - array to print and update
//   N - number of elements in A
__global__ void GPU_func(int A[ ], int N)      // Define function
{
    printf("++ GPU sees A: ");
    for (int i = 0; i < N; i++ )
        printf("%d ", A[i]);
    printf("\n");

    for (int i = 0; i < N; i++ )
        A[i] += i;          // Update A[i]
}

// The kernel is defined above, before main() uses it.
int main()
{
    int *x;
    cudaMallocManaged( &x, 10*sizeof( int ) );  // array of 10 ints in managed memory

    for (int i = 0; i < 10; i++ )
        x[i] = 1000+i;

    GPU_func<<< 1, 1 >>>( x, 10 );              // Use a function
    cudaDeviceSynchronize();    // wait for the kernel before the CPU reads x

    printf("** CPU sees x: ");
    for (int i = 0; i < 10; i++ )
        printf("%d ", x[i]);
    printf("\n");
}
We get a compile error if a function is defined after it was used:
// NOTE: deliberately erroneous example - GPU_func is used in main() before it
// has been defined or declared, which causes a compile error.
int main()
{
    int *x;
    cudaMallocManaged( &x, 10*sizeof( int ) );  // array of 10 ints in managed memory

    for (int i = 0; i < 10; i++ )
        x[i] = 1000+i;

    GPU_func<<< 1, 1 >>>( x, 10 );              // Use a function
    cudaDeviceSynchronize();    // wait for the kernel before the CPU reads x

    printf("** CPU sees x: ");
    for (int i = 0; i < 10; i++ )
        printf("%d ", x[i]);
    printf("\n");
}

// Kernel: prints the N elements of A as seen by the GPU, then adds i to A[i].
//   A - array to print and update
//   N - number of elements in A
__global__ void GPU_func(int A[ ], int N)      // Define function
{
    printf("++ GPU sees A: ");
    for (int i = 0; i < N; i++ )
        printf("%d ", A[i]);
    printf("\n");

    for (int i = 0; i < N; i++ )
        A[i] += i;          // Update A[i]
}
We can use the following CUDA C syntax to declare a __global__ function:
// Forward declaration: lets main() launch the kernel before its definition.
__global__ void GPU_func(int A[ ], int N);     // Declaration

int main()
{
    int *x;
    cudaMallocManaged( &x, 10*sizeof( int ) );  // array of 10 ints in managed memory

    for (int i = 0; i < 10; i++ )
        x[i] = 1000+i;

    GPU_func<<< 1, 1 >>>( x, 10 );              // Use a function
    cudaDeviceSynchronize();    // wait for the kernel before the CPU reads x

    printf("** CPU sees x: ");
    for (int i = 0; i < 10; i++ )
        printf("%d ", x[i]);
    printf("\n");
}

// Kernel: prints the N elements of A as seen by the GPU, then adds i to A[i].
//   A - array to print and update
//   N - number of elements in A
__global__ void GPU_func(int A[ ], int N)      // Define function
{
    printf("++ GPU sees A: ");
    for (int i = 0; i < N; i++ )
        printf("%d ", A[i]);
    printf("\n");

    for (int i = 0; i < N; i++ )
        A[i] += i;          // Update A[i]
}
DEMO: /home/cs355001/demo/CUDA/2-unified-mem/shared-malloc-array.cu