|  
 
  |  
 
  |  
In CUDA, we can assign each thread a 2-dimensional identifier (and even a 3-dimensional identifier!)
 
  |  
int threadRowID, threadColId;

threadRowID = blockIdx.x * blockDim.x + threadIdx.x;
threadColId = blockIdx.y * blockDim.y + threadIdx.y;
Example:
/* ------------------------------------------------------------------
   hello: kernel in which every thread computes its global 2-D
   (row, col) position from its block and thread coordinates and
   prints it. Intended to be launched with a 2-D grid of 2-D blocks.
   ------------------------------------------------------------------ */
__global__ void hello( )
{
   /* Global index = (this block's offset in blocks) * (block size)
                     + (this thread's position inside its block)   */
   int threadRowID = blockIdx.x * blockDim.x + threadIdx.x;
   int threadColId = blockIdx.y * blockDim.y + threadIdx.y;

   /* Print the thread's 2-dim grid ID */
   printf("Blk: (%d,%d) Thread: (%d,%d) -> Row/Col = (%d,%d)\n",  
          blockIdx.x, blockIdx.y,
          threadIdx.x, threadIdx.y,
          threadRowID, threadColId);
}
/* ------------------------------------------------------------------
   main: launch the hello kernel on a 3x2 grid of 2x3 blocks
   (36 threads total), then wait for the device so all kernel
   printf output is flushed before the program exits.
   Returns 0 on success, 1 on any CUDA error.
   ------------------------------------------------------------------ */
int main()
{
   dim3 blockShape = dim3( 2, 3 );   // 2x3 threads per block
   dim3 gridShape  = dim3( 3, 2 );   // 3x2 blocks in the grid

   hello<<< gridShape, blockShape >>>( );  // Launch a 2 dim grid of threads

   /* A kernel launch returns no status itself; a bad launch
      configuration only surfaces through cudaGetLastError() */
   cudaError_t err = cudaGetLastError();
   if ( err != cudaSuccess )
   {
      fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(err));
      return 1;
   }

   printf("I am the CPU: Hello World ! \n");

   /* The launch is asynchronous: block here until the kernel is
      done (this also surfaces in-kernel execution errors) */
   err = cudaDeviceSynchronize();
   if ( err != cudaSuccess )
   {
      fprintf(stderr, "Kernel execution failed: %s\n", cudaGetErrorString(err));
      return 1;
   }
   return 0;
}
 |  
        
  
     /home/cs355001/demo/CUDA/1-intro/hello-2dim-ID
  Output:
I am the CPU: Hello World ! 
Blk: (2,0) Thread: (0,0) -> Row/Col = (4,0)
Blk: (2,0) Thread: (1,0) -> Row/Col = (5,0)
Blk: (2,0) Thread: (0,1) -> Row/Col = (4,1)
Blk: (2,0) Thread: (1,1) -> Row/Col = (5,1)   // If you look carefully at   
Blk: (2,0) Thread: (0,2) -> Row/Col = (4,2)   // the last column, the
Blk: (2,0) Thread: (1,2) -> Row/Col = (5,2)   // indexes span these ranges:
Blk: (0,0) Thread: (0,0) -> Row/Col = (0,0)   //
Blk: (0,0) Thread: (1,0) -> Row/Col = (1,0)   //   (0,0) (0,1) ...   (0,5)
Blk: (0,0) Thread: (0,1) -> Row/Col = (0,1)   //   (1,0) (1,1) ...   (1,5)
Blk: (0,0) Thread: (1,1) -> Row/Col = (1,1)   //   (2,0) (2,1) ...   (2,5)
Blk: (0,0) Thread: (0,2) -> Row/Col = (0,2)   //   ...
Blk: (0,0) Thread: (1,2) -> Row/Col = (1,2)   //   (5,0) (5,1) ...   (5,5)
Blk: (2,1) Thread: (0,0) -> Row/Col = (4,3)   //
Blk: (2,1) Thread: (1,0) -> Row/Col = (5,3)   // So you can use these indexes    
Blk: (2,1) Thread: (0,1) -> Row/Col = (4,4)   // to access matrices !!!
Blk: (2,1) Thread: (1,1) -> Row/Col = (5,4)
Blk: (2,1) Thread: (0,2) -> Row/Col = (4,5)
Blk: (2,1) Thread: (1,2) -> Row/Col = (5,5)
Blk: (0,1) Thread: (0,0) -> Row/Col = (0,3)
Blk: (0,1) Thread: (1,0) -> Row/Col = (1,3)
Blk: (0,1) Thread: (0,1) -> Row/Col = (0,4)
Blk: (0,1) Thread: (1,1) -> Row/Col = (1,4)
Blk: (0,1) Thread: (0,2) -> Row/Col = (0,5)
Blk: (0,1) Thread: (1,2) -> Row/Col = (1,5)
Blk: (1,0) Thread: (0,0) -> Row/Col = (2,0)
Blk: (1,0) Thread: (1,0) -> Row/Col = (3,0)
Blk: (1,0) Thread: (0,1) -> Row/Col = (2,1)
Blk: (1,0) Thread: (1,1) -> Row/Col = (3,1)
Blk: (1,0) Thread: (0,2) -> Row/Col = (2,2)
Blk: (1,0) Thread: (1,2) -> Row/Col = (3,2)
Blk: (1,1) Thread: (0,0) -> Row/Col = (2,3)
Blk: (1,1) Thread: (1,0) -> Row/Col = (3,3)
Blk: (1,1) Thread: (0,1) -> Row/Col = (2,4)
Blk: (1,1) Thread: (1,1) -> Row/Col = (3,4)
Blk: (1,1) Thread: (0,2) -> Row/Col = (2,5)
Blk: (1,1) Thread: (1,2) -> Row/Col = (3,5)
 |  
 
  |  
 
  |  
I won't go into the details; it's similar to the 2-D example above.
    /usr/local/cuda/samples/           
 |  
   /usr/local/cuda/samples/1_Utilities/deviceQuery/deviceQuery           
 Output on ghost01:
Detected 1 CUDA Capable device(s)
Device 0: "Quadro P1000"
  CUDA Driver Version / Runtime Version          9.2 / 9.0
  CUDA Capability Major/Minor version number:    6.1
  Total amount of global memory:                 4040 MBytes (4235919360 bytes)
  ( 5) Multiprocessors, (128) CUDA Cores/MP:     640 CUDA Cores
  GPU Max Clock rate:                            1481 MHz (1.48 GHz)
  Memory Clock rate:                             2505 Mhz
  Memory Bus Width:                              128-bit
  L2 Cache Size:                                 1048576 bytes
  Maximum Texture Dimension Size (x,y,z)         1D=(131072), 2D=(131072, 65536), 3D=(16384, 16384, 16384)
  Maximum Layered 1D Texture Size, (num) layers  1D=(32768), 2048 layers
  Maximum Layered 2D Texture Size, (num) layers  2D=(32768, 32768), 2048 layers
  Total amount of constant memory:               65536 bytes
  Total amount of shared memory per block:       49152 bytes
  Total number of registers available per block: 65536
  Warp size:                                     32
  Maximum number of threads per multiprocessor:  2048
  Maximum number of threads per block:           1024
  Max dimension size of a thread block (x,y,z): (1024, 1024, 64)              -- max block size
  Max dimension size of a grid size    (x,y,z): (2147483647, 65535, 65535)    -- max grid size
  Maximum memory pitch:                          2147483647 bytes
  Texture alignment:                             512 bytes
  Concurrent copy and kernel execution:          Yes with 2 copy engine(s)
  Run time limit on kernels:                     No
  Integrated GPU sharing Host Memory:            No
  Support host page-locked memory mapping:       Yes
  Alignment requirement for Surfaces:            Yes
  Device has ECC support:                        Disabled
  Device supports Unified Addressing (UVA):      Yes
  Supports Cooperative Kernel Launch:            Yes
  Supports MultiDevice Co-op Kernel Launch:      Yes
  Device PCI Domain ID / Bus ID / location ID:   0 / 1 / 0
  Compute Mode:
     < Default (multiple host threads can use ::cudaSetDevice() 
       with device simultaneously) >
deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 9.2, 
CUDA Runtime Version = 9.0, NumDevs = 1
Result = PASS
 |