__managed__ C-global-var-definition; |
When you define the variable as global:
|
// x is declared __managed__ at file scope, so both the CPU code in main()
// and the GPU kernel below refer to the same variable.
__managed__ int x;

// Kernel: print the value the CPU stored in x, then overwrite it.
__global__ void GPU_func()
{
    printf("GPU sees x = %d\n", x);
    x = 4444;
}

int main()
{
    x = 1234;                       // CPU writes the shared variable

    GPU_func<<<1, 1>>>();           // launch the kernel: 1 block, 1 thread
    cudaDeviceSynchronize();        // wait until the kernel has finished

    printf("CPU sees x = %d\n", x); // CPU reads the value the kernel stored
    return 0;
}
Key point: a __managed__ global array is shared between the CPU and the GPU in exactly the same way:
|
// A __managed__ array at file scope: the CPU code in main() and the GPU
// kernel below both access the same 10 elements.
__managed__ int x[10];

// Kernel: print every element as the CPU left it, then add the element's
// index to it.
__global__ void GPU_func()
{
    printf("++ GPU sees x: ");
    for (int k = 0; k < 10; k++) {
        printf("%d ", x[k]);
        x[k] += k;                  // GPU updates x[k]
    }
    printf("\n");
}

int main()
{
    // CPU initializes the shared array.
    for (int k = 0; k < 10; k++) {
        x[k] = 1000 + k;
    }

    GPU_func<<<1, 1>>>();           // kernel reads the CPU's values and updates them
    cudaDeviceSynchronize();        // CPU waits until the kernel has finished

    // CPU now reads the values written by the kernel.
    printf("** CPU sees x: ");
    for (int k = 0; k < 10; k++) {
        printf("%d ", x[k]);
    }
    printf("\n");

    return 0;
}
/home/cs355001/demo/CUDA/2-unified-mem/shared-global-array Output: ++ GPU sees x: 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 ** CPU sees x: 1000 1002 1004 1006 1008 1010 1012 1014 1016 1018 |
|
The following construct is illegal, because __managed__ cannot be used to define a local variable:
// Deliberately invalid example: __managed__ is applied to a local variable
// inside main(), and nvcc rejects that line with "expected a declaration".
int main()
{
   int a;

   __managed__ a = 1234;      // Illegal: cannot define "managed" local variable

   return 0;
}
/home/cs355001/demo/CUDA/2-unified-mem/shared-local-fail.cu nvcc -g -o shared-local-fail.o -c shared-local-fail.cu shared-local-fail.cu(13): error: expected a declaration Line 13 is: __managed__ a = 1234; // Illegal: cannot define "managed" local variable |
// Legal alternative for a function-local shared variable: declare a local
// pointer and let cudaMallocManaged() make it point to a managed int.
int *a;                                    // You must use a pointer variable
cudaMallocManaged( &a, sizeof( int ) );    // Create a shared local int variable
Example:
// Kernel: print the int that x points to, then overwrite it.
// x must point to memory the GPU can access (here: a managed allocation).
__global__ void GPU_func(int *x)
{
    printf("GPU sees *x = %d\n", *x);   // *x accesses the shared int variable
    *x = 4444;                          // update the shared variable
}

// Allocates one managed int, lets the kernel read and update it, prints the
// updated value on the CPU, and releases the allocation.
// Returns 0 on success, 1 if the managed allocation fails.
int main()
{
    int *a;                             // must use a pointer variable

    // Create the shared managed variable and make a point to it.
    // Check the result: on failure a is not a valid pointer and the
    // assignment below would dereference garbage.
    cudaError_t err = cudaMallocManaged(&a, sizeof(int));
    if (err != cudaSuccess) {
        printf("cudaMallocManaged failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    *a = 1234;                          // CPU writes the shared variable

    GPU_func<<<1, 1>>>(a);              // pass the variable's address to the kernel
    cudaDeviceSynchronize();            // wait until the kernel has finished

    printf("CPU sees *a = %d\n", *a);

    cudaFree(a);                        // release the managed allocation
    return 0;
}
/home/cs355001/demo/CUDA/2-unified-mem/shared-local Output: GPU sees *x = 1234 CPU sees *a = 4444 |
Example:
__global__ void GPU_func(int *a);       // declare the GPU function

// Allocates a managed array of 10 ints, initializes it on the CPU, lets the
// kernel print and update it, prints the updated values on the CPU, and
// releases the allocation.
// Returns 0 on success, 1 if the managed allocation fails.
int main()
{
    int *a;                             // a is a local pointer variable

    // a -> 10 shared ints (= array). Check the result: on failure a is not
    // a valid pointer and the initialization loop would write through it.
    cudaError_t err = cudaMallocManaged(&a, 10 * sizeof(int));
    if (err != cudaSuccess) {
        printf("cudaMallocManaged failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    for (int i = 0; i < 10; i++) {
        a[i] = 1000;
    }

    GPU_func<<<1, 1>>>(a);              // must pass a to the kernel to "share" it
    cudaDeviceSynchronize();            // wait until the kernel has finished
                                        // before the CPU accesses a again

    printf("** CPU sees a: ");
    for (int i = 0; i < 10; i++) {
        printf("%d ", a[i]);
    }
    printf("\n");

    cudaFree(a);                        // release the managed allocation
    return 0;
}

// Kernel: print the 10 elements of a as the CPU left them, then add each
// element's index to it.
__global__ void GPU_func(int *a)
{
    printf("++ GPU sees a: ");
    for (int i = 0; i < 10; i++) {
        printf("%d ", a[i]);
        a[i] = a[i] + i;                // GPU_func updates a[ ]
    }
    printf("\n");
}
/home/cs355001/demo/CUDA/2-unified-mem/shared-local-array Output: ++ GPU sees a: 1000 1000 1000 1000 1000 1000 1000 1000 1000 1000 ** CPU sees a: 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 |