CUDA implementation of the odd-even sort algorithm

We will create N threads, one thread per array element:

The threads will repeat the sorting steps until a full pass makes no swaps (i.e., the variable isSorted remains 1 after the pass)

CUDA implementation of the odd-even sort algorithm

The even numbered threads will start first:

Thread e compares A[e] with A[e+1] and swap them if they are out of order
And: sets   isSorted=0

CUDA implementation of the odd-even sort algorithm

The odd numbered threads will run next:

Thread o compares A[o] with A[o+1] and swap them if they are out of order
And: sets   isSorted=0

The CUDA odd-even sort algorithm - CPU part

The CPU code:   (1) allocate arrays and (2) then launches the threads (using a <<< 1,N >>> grid):

// CPU driver skeleton for the CUDA odd-even sort (filled in on later slides):
// (1) read N, (2) allocate a managed array, (3) launch oddEvenSort<<<1,N>>>.
int main(int argc, char *argv[])
{
  int N = Input array size   // pseudo-code: N comes from user input (argv)

  // ... body is added incrementally on the following slides ...

} 

N will be specified by the user input

The CUDA odd-even sort algorithm - CPU part

We define the reference variables to help us allocate the shared arrays:

// CPU driver skeleton: declare the pointer that will refer to the shared
// (managed) array; the actual allocation is added on the next slide.
int main(int argc, char *argv[])
{
  int N = Input array size   // pseudo-code: N comes from user input (argv)

  int *A;   // will point to the array shared between CPU and GPU

  // ... allocation and kernel launch are added on the following slides ...

} 

(This is CUDA syntax... i.e., how CUDA provides the dynamically-allocated shared (managed) array to us)

The CUDA odd-even sort algorithm - CPU part

Allocate the shared vector (as a 1-dim array):

// CPU driver: allocate one managed (CPU/GPU-shared) array of N ints.
int main(int argc, char *argv[])
{
  int N = Input array size   // pseudo-code: N comes from user input (argv)

  int *A;

  /* =======================================
     Allocate a shared (managed) array
     ======================================= */
  // BUG FIX: A is an array of int, so the element size must be sizeof(int).
  // The original N*sizeof(float) only "worked" because sizeof(int) happens
  // to equal sizeof(float) on common platforms.
  cudaMallocManaged(&A, N*sizeof(int));

  // initialize array A (code omitted)

} 

(This is CUDA syntax... i.e., how CUDA provides the dynamically-allocated shared (managed) array to us)

The CUDA odd-even sort algorithm - CPU part

Launch (at least) N threads as a <<< 1,N >>> grid to perform the odd-even sort:

// CPU driver: allocate the managed array, launch the sort kernel, and wait.
int main(int argc, char *argv[])
{
  int N = Input array size   // pseudo-code: N comes from user input (argv)

  int *A;

  /* =======================================
     Allocate the shared (managed) array
     (only the single array A is allocated here)
     ======================================= */
  // BUG FIX: element type is int, so the size is N*sizeof(int),
  // not N*sizeof(float).
  cudaMallocManaged(&A, N*sizeof(int));

  // initialize array A (code omitted)

  oddEvenSort<<< 1, N >>>(A, N);   // Using > 1 block can result in error !
                                   // (__shared__/__syncthreads() only work
                                   //  within one block -- explained later...)

  // BUG FIX: kernel launches are asynchronous; without this barrier main()
  // could return (and the host could read A) before the GPU finishes sorting.
  cudaDeviceSynchronize();
} 

We will write the kernel code for oddEvenSort( ) next...

The CUDA odd-even sort algorithm - GPU part

First, make each thread computes its own unique ID i:

// Odd-even transposition sort: one thread per array element, launched as
// <<<1, N>>>.  Thread i repeatedly compares a[i] with a[i+1] in alternating
// even/odd phases until a full pass makes no swap.
// NOTE(review): isSorted is a per-thread LOCAL variable here; a later slide
// in this file shows that this version fails and fixes it with __shared__.
__global__ void oddEvenSort(int *a, int n)
{
  int isSorted;                                // per-thread "no swaps" flag
  int i = blockIdx.x*blockDim.x + threadIdx.x; // Unique ID

  isSorted = 0;                                // force at least one pass

  while ( !isSorted )
  {
    isSorted = 1;                  // assume sorted; any swap below clears it

    if ( i%2 == 0 && i < n-1 )      // Even phase
    {
       if (a[i] > a[i+1])
       {
          SWAP(a[i], a[i+1]);
          isSorted = 0;
       }
    }
    __syncthreads(); // All threads must finish before move on

    if ( i%2 == 1 && i < n-1 )      // Odd phase
    {
       if (a[i] > a[i+1])
       {
          SWAP(a[i], a[i+1]);
          isSorted = 0;
       }
    }
    __syncthreads(); // All threads must finish before move on
  }
}

The CUDA odd-even sort algorithm - GPU part

Repeat the sort step (compare and swap) until the array is sorted:

// Odd-even sort kernel as shown on this slide (repeat compare-and-swap
// passes until no swap occurs).
// BUG FIX: the stray slide callout text "SORT step" that sat between the two
// phases was not valid C and would not compile; it is kept as a comment.
// NOTE(review): isSorted is still a per-thread local here; the file itself
// flags this later ("Failed DEMO") and fixes it with __shared__.
__global__ void oddEvenSort(int *a, int n)
{
  int isSorted;
  int i = blockIdx.x*blockDim.x + threadIdx.x; // Unique ID

  isSorted = 0;

  while ( isSorted == 0 )
  {
    isSorted = 1;

    if ( i%2 == 0 && i < n-1 )      // Even phase
    {
       if (a[i] > a[i+1])
       {
          SWAP(a[i], a[i+1]);
          isSorted = 0;
       }
    }
    __syncthreads(); // All threads must finish before move on
            // SORT step
    if ( i%2 == 1 && i < n-1 )      // Odd phase
    {
       if (a[i] > a[i+1])
       {
          SWAP(a[i], a[i+1]);
          isSorted = 0;
       }
    }
    __syncthreads(); // All threads must finish before move on
  }
}

The CUDA odd-even sort algorithm - GPU part

Assume that the array is sorted (isSorted is updated to 0 if we need to swap):

// Slide focus: assume the array is sorted at the top of each pass
// (isSorted = 1); any swap in either phase resets the flag to 0.
// NOTE(review): isSorted is a per-thread local -- fixed on a later slide.
__global__ void oddEvenSort(int *a, int n)
{
  int isSorted;
  int i = blockIdx.x*blockDim.x + threadIdx.x; // Unique ID

  isSorted = 0;

  while ( isSorted == 0 )
  {
    isSorted = 1;

    if ( i%2 == 0 && i < n-1 )      // Even phase
    {
       if (a[i] > a[i+1])
       {
          SWAP(a[i], a[i+1]);
          isSorted = 0;
       }
    }
    __syncthreads(); // All threads must finish before move on

    if ( i%2 == 1 && i < n-1 )      // Odd phase
    {
       if (a[i] > a[i+1])
       {
          SWAP(a[i], a[i+1]);
          isSorted = 0;
       }
    }
    __syncthreads(); // All threads must finish before move on
  }
}

The CUDA odd-even sort algorithm - GPU part

Even phase:   allow only the "even" threads to do the compare and swap step:

// Slide focus: the even phase -- all threads evaluate the guard, but only
// even-numbered threads (with a right neighbor) compare and swap.
// NOTE(review): isSorted is a per-thread local -- fixed on a later slide.
__global__ void oddEvenSort(int *a, int n)
{
  int isSorted;
  int i = blockIdx.x*blockDim.x + threadIdx.x; // Unique ID

  isSorted = 0;

  while ( isSorted == 0 )
  {
    isSorted = 1;

    if ( i%2 == 0 && i < n-1 ) // Even phase (test is exec'ed by ALL threads)
    {  // ONLY an "even" thread execute this if-statement
       if (a[i] > a[i+1])
       {
          SWAP(a[i], a[i+1]);
          isSorted = 0;
       }
    }
    __syncthreads(); // All threads must finish before move on

    if ( i%2 == 1 && i < n-1 )      // Odd phase
    {
       if (a[i] > a[i+1])
       {
          SWAP(a[i], a[i+1]);
          isSorted = 0;
       }
    }
    __syncthreads(); // All threads must finish before move on
  }
}

The CUDA odd-even sort algorithm - GPU part

Even phase:   make sure that all threads are finished before moving onwards:

// Slide focus: the __syncthreads() barrier after the even phase makes every
// thread wait until all even-phase swaps are done before the odd phase reads
// the array.
// NOTE(review): isSorted is a per-thread local -- fixed on a later slide.
__global__ void oddEvenSort(int *a, int n)
{
  int isSorted;
  int i = blockIdx.x*blockDim.x + threadIdx.x; // Unique ID

  isSorted = 0;

  while ( isSorted == 0 )
  {
    isSorted = 1;

    if ( i%2 == 0 && i < n-1 ) // Even phase (test is exec'ed by ALL threads)
    {  // ONLY an "even" thread execute this if-statement
       if (a[i] > a[i+1])
       {
          SWAP(a[i], a[i+1]);
          isSorted = 0;
       }
    }
    __syncthreads(); // All threads must finish before move on

    if ( i%2 == 1 && i < n-1 )      // Odd phase
    {
       if (a[i] > a[i+1])
       {
          SWAP(a[i], a[i+1]);
          isSorted = 0;
       }
    }
    __syncthreads(); // All threads must finish before move on
  }
}

The CUDA odd-even sort algorithm - GPU part

Odd phase:   allow only the "odd" threads to do the compare and swap step:

// Slide focus: the odd phase -- all threads evaluate the guard, but only
// odd-numbered threads (with a right neighbor) compare and swap.
// NOTE(review): isSorted is a per-thread local -- fixed on a later slide.
__global__ void oddEvenSort(int *a, int n)
{
  int isSorted;
  int i = blockIdx.x*blockDim.x + threadIdx.x; // Unique ID

  isSorted = 0;

  while ( isSorted == 0 )
  {
    isSorted = 1;

    if ( i%2 == 0 && i < n-1 ) // Even phase (test is exec'ed by ALL threads)
    {  // ONLY an "even" thread execute this if-statement
       if (a[i] > a[i+1])
       {
          SWAP(a[i], a[i+1]);
          isSorted = 0;
       }
    }
    __syncthreads(); // All threads must finish before move on

    if ( i%2 == 1 && i < n-1 )  // odd phase (test is exec'ed by ALL threads)
    {  // ONLY an "odd" thread execute this if-statement
       if (a[i] > a[i+1])
       {
          SWAP(a[i], a[i+1]);
          isSorted = 0;
       }
    }
    __syncthreads(); // All threads must finish before move on
  }
}

The CUDA odd-even sort algorithm - GPU part

Odd phase:   make sure that all threads are finished before moving onwards:

// Slide focus: the __syncthreads() barrier after the odd phase makes every
// thread wait until all odd-phase swaps are done before the next pass begins.
// NOTE(review): isSorted is a per-thread local -- the "Failed DEMO" below
// shows why this version does not terminate correctly.
__global__ void oddEvenSort(int *a, int n)
{
  int isSorted;
  int i = blockIdx.x*blockDim.x + threadIdx.x; // Unique ID

  isSorted = 0;

  while ( isSorted == 0 )
  {
    isSorted = 1;

    if ( i%2 == 0 && i < n-1 ) // Even phase (test is exec'ed by ALL threads)
    {  // ONLY an "even" thread execute this if-statement
       if (a[i] > a[i+1])
       {
          SWAP(a[i], a[i+1]);
          isSorted = 0;
       }
    }
    __syncthreads(); // All threads must finish before move on

    if ( i%2 == 1 && i < n-1 )  // odd phase (test is exec'ed by ALL threads)
    {  // ONLY an "odd" thread execute this if-statement
       if (a[i] > a[i+1])
       {
          SWAP(a[i], a[i+1]);
          isSorted = 0;
       }
    }
    __syncthreads(); // All threads must finish before move on
  }
}

Failed DEMO: /home/cs355001/demo/CUDA/7-sort/odd-even.cu

Problem:   the variable isSorted is a local variable (not shared...)

Local variables are "private" variables for a thread:

// BROKEN version kept on this slide to illustrate the bug: isSorted is a
// LOCAL variable, so each thread has its own private copy and one thread's
// update is invisible to the others.
__global__ void oddEvenSort(int *a, int n)
{ // EACH thread has its own copy of local variables
  int isSorted;
  int i = blockIdx.x*blockDim.x + threadIdx.x; // Unique ID for each thread

  isSorted = 0;

  while ( isSorted == 0 )
  {
    isSorted = 1;

    if ( i%2 == 0 && i < n-1 ) // Even phase (test is exec'ed by ALL threads)
    {  // ONLY an "even" thread execute this if-statement
       if (a[i] > a[i+1])
       {
          SWAP(a[i], a[i+1]);
          isSorted = 0;
       }
    }
    __syncthreads(); // All threads must finish before move on

    if ( i%2 == 1 && i < n-1 )  // odd phase (test is exec'ed by ALL threads)
    {  // ONLY an "odd" thread execute this if-statement
       if (a[i] > a[i+1])
       {
          SWAP(a[i], a[i+1]);
          isSorted = 0;
       }
    }
    __syncthreads(); // All threads must finish before move on
  }
}

Problem:   the variable isSorted is a local variable (not shared...)

When thread i updates isSorted, the isSorted variable in other threads are not changed:

// BROKEN version kept on this slide to illustrate the consequence of the
// local flag: when thread i swaps and clears ITS isSorted, the isSorted
// copies of the other threads are unchanged, so threads disagree about
// whether another pass is needed.
__global__ void oddEvenSort(int *a, int n)
{ // EACH thread has its own copy of local variables
  int isSorted;
  int i = blockIdx.x*blockDim.x + threadIdx.x; // Unique ID

  isSorted = 0;

  while ( isSorted == 0 )
  {
    isSorted = 1;

    if ( i%2 == 0 && i < n-1 ) // Even phase (test is exec'ed by ALL threads)
    {  // ONLY an "even" thread execute this if-statement
       if (a[i] > a[i+1])
       {
          SWAP(a[i], a[i+1]);
          isSorted = 0; // Thread i will ONLY update its copy !!
       }                   
    }
    __syncthreads(); // All threads must finish before move on

    if ( i%2 == 1 && i < n-1 )  // odd phase (test is exec'ed by ALL threads)
    {  // ONLY an "odd" thread execute this if-statement
       if (a[i] > a[i+1])
       {
          SWAP(a[i], a[i+1]);
          isSorted = 0; // Thread i will ONLY update its copy !!
       }                   
    }
    __syncthreads(); // All threads must finish before move on
  }
}

Solution:   we must make all threads use (share) the same variable isSorted

Recall that each stream multi-processor in the GPU has a shared memory:

Variables stored in the shared memory will be shared between all threads running on the same multi-processor (= in the same thread block)

Solution:   we must make all threads use (share) the same variable isSorted

The keyword __shared__ will allocate a variable in the shared memory:

/*
 * Odd-even transposition sort, one thread per element.
 * Must be launched as a single block (<<<1, N>>>): __syncthreads() and the
 * __shared__ flag only coordinate threads within one block.
 *
 * BUG FIX vs. the slide version: a fast thread could write isSorted = 1 at
 * the top of its next iteration before a slower thread had read the while
 * condition, so threads could disagree on whether to loop again -- and a
 * __syncthreads() reached by only some threads of the block is undefined
 * behavior (typically a hang).  The rewrite below lets only thread 0 reset
 * the flag, and makes every thread read the flag between two barriers so
 * all threads take the same exit decision.
 */
__global__ void oddEvenSort(int *a, int n)
{
  __shared__ int isSorted; // one copy per block, visible to all its threads
  int i = blockIdx.x*blockDim.x + threadIdx.x; // Unique ID

  while (1)
  {
    if (threadIdx.x == 0)  // single writer avoids a write-write race
       isSorted = 1;
    __syncthreads();       // reset is visible before anyone tests/clears it

    if ( i%2 == 0 && i < n-1 )      // Even phase: compare a[i] with a[i+1]
    {
       if (a[i] > a[i+1])
       {
          SWAP(a[i], a[i+1]);
          isSorted = 0;    // concurrent writers all store the same value (0)
       }
    }
    __syncthreads(); // All threads must finish the even phase before moving on

    if ( i%2 == 1 && i < n-1 )      // Odd phase: compare a[i] with a[i+1]
    {
       if (a[i] > a[i+1])
       {
          SWAP(a[i], a[i+1]);
          isSorted = 0;
       }
    }
    __syncthreads(); // All threads must finish the odd phase before moving on

    int done = isSorted;   // every thread samples the flag here...
    __syncthreads();       // ...before thread 0 can reset it next iteration
    if (done) break;       // uniform decision: all threads exit together
  }
}

DEMO: /home/cs355001/demo/CUDA/7-sort/odd-even.cu

__shared__ variables are shared among threads in the same thread block

  • The following CUDA program shows the sharing of the same variable x among 10 threads in 1 thread block:

    // Demo kernel: all threads of one block share ONE copy of x.
    // Every thread stores 1, then atomically adds 1, so after the second
    // barrier each thread prints the same value: 1 + (threads per block).
    __global__ void useShared()
    {
        __shared__ int x;	// Shared variable: 1 copy per thread block
    
        x = 1;              // all threads store the same initial value
        __syncthreads();    // initialization done before any increment
    
        atomicAdd(&x, 1);   // one atomic increment per thread
        // x++;			// x++ is NOT atomic !!!
        __syncthreads();    // all increments done before any thread prints
    
        printf("block id = %d(%d), x = %d\n", blockIdx.x, threadIdx.x, x);
    }
    
    // Host driver: launch useShared with 1 block of 10 threads, then block
    // until the kernel (and its printf output) has completed.
    int main(int argc, char *argv[])
    {
        // 1 thread block with 10 threads that SHARE variable x
        useShared<<<1, 10>>>();
    
        // Wait for GPU to finish before accessing on host
        cudaDeviceSynchronize();
    }
    

    Output: x = 11 (10 times)

DEMO: demo/CUDA/8-shared-vars/shared_var.cu

__shared__ variables are shared among threads in the same thread block

  • The following CUDA program shows that there are 5 separate copies of x — one copy shared by the 2 threads in each of the 5 thread blocks:

    // Same demo kernel as before: each thread BLOCK gets its own copy of x
    // (launched below with 5 blocks of 2 threads, so there are 5 copies).
    __global__ void useShared()
    {
        __shared__ int x;	// Shared variable: 1 copy per thread block
    
        x = 1;              // all threads of the block store the same value
        __syncthreads();    // initialization done before any increment
    
        atomicAdd(&x, 1);   // one atomic increment per thread in the block
        // x++;			// x++ is NOT atomic !!!
        __syncthreads();    // all increments done before any thread prints
    
        printf("block id = %d(%d), x = %d\n", blockIdx.x, threadIdx.x, x);
    }
    
    // Host driver: 5 blocks of 2 threads; the threads of each block share
    // that block's own copy of x.  Block until the kernel has completed.
    int main(int argc, char *argv[])
    {
        // 5 thread blocks with 2 threads each that SHARE variable x
        useShared<<<5, 2>>>();
    
        // Wait for GPU to finish before accessing on host
        cudaDeviceSynchronize();
    }
    

    Output: x = 3 (10 times)

DEMO: demo/CUDA/8-shared-vars/shared_var2.cu