|
K = number of threads per thread block
|
int K = atoi( argv[2] ); // Read K (threads per block) in from the command line
                         // NOTE(review): atoi is unchecked — K <= 0 or non-numeric
                         // input would break the grid-size computation; verify caller
/* ==============================================
   Find # blocks needed to launch N*N threads
   where each block contains K threads
   ============================================== */
// Ceiling division: round up so every one of the N*N elements gets a thread
// (the cast binds to the first N, giving (float)N * N before the divide)
int NBlks = ceil((float) N*N / K );
// ==================================================================
// Run kernel on the GPU using NBlks blocks, K threads per block
matrixMult<<<NBlks, K>>>(N, C, A, B);
|
Program: /home/cs355001/demo/CUDA/4-mult-matrix2.cu |
|
From CUDA manual pages:
__host__ cudaError_t
cudaOccupancyMaxPotentialBlockSize ( int* minGridSize, int* blockSize,
T func, size_t dynamicSMemSize = 0, int blockSizeLimit = 0 )
Returns grid and block size that achieves maximum potential occupancy for a device function.
Parameters
minGridSize
- Returned minimum grid size needed to achieve the best potential occupancy
blockSize
- Returned block size
func
- Device function symbol
dynamicSMemSize
- Per-block dynamic shared memory usage intended, in bytes
blockSizeLimit
- The maximum block size func is designed to work with. 0 means no limit.
Function return values:
cudaSuccess, cudaErrorInvalidDevice, cudaErrorInvalidDeviceFunction,
cudaErrorInvalidValue, cudaErrorUnknown,
Function description:
Returns in *minGridSize and *blockSize a suggested grid / block size pair
that achieves the best potential occupancy (i.e. the maximum number
of active warps with the smallest number of blocks).
|
int minGridSize; // Returned: minimum grid size needed to achieve
                 // the maximum occupancy for a full device launch
int BlkSize;     // Returned: block size to use for max occupancy
// Let the runtime suggest the block size for best occupancy of matrixMult
// (dynamic shared mem = 0 bytes, no block-size limit).
// NOTE(review): the returned cudaError_t status is not checked — consider
// wrapping in an error-check macro.
cudaOccupancyMaxPotentialBlockSize( &minGridSize, &BlkSize, matrixMult, 0, 0);
printf("Computed: minGridSize = %d, BlkSize = %d\n", minGridSize, BlkSize);
// Ceiling division: grid must cover all N*N elements, so round up
int GridSize = ceil((float) N*N / BlkSize ); // Round up to integral grid size
printf("N*N = %d/BlkSize = %lf ---> GridSize = %d\n",
N*N, (float) N*N/BlkSize, GridSize);
// Run kernel on the GPU using GridSize blocks, BlkSize threads per block
matrixMult<<<GridSize, BlkSize>>>(N, C, A, B);
// Kernel launches are asynchronous: block the host until the GPU finishes
// before reading results back on the host
cudaDeviceSynchronize();
|
Program: /home/cs355001/demo/CUDA/4-mult-matrix/mult-matrix-auto.cu
Output:
cs355@ghost01 (2095)> mult-matrix-auto 1000
Computed: minGridSize = 10, BlkSize = 1024
N*N = 1000000/BlkSize = 976.562500 ---> GridSize = 977
Elasped time = 42972 micro secs
|