CUDA - Multiple parallel reductions sometimes fail


I have the following problem. I have implemented several different parallel reduction algorithms, and all of them work correctly if I'm reducing only one value per kernel. But I need to reduce several (21), and I have no idea why it sometimes works and sometimes doesn't.

The steps performed are:

  • calculate the relevant values per thread (in the example they are simply set to 1, since it shows the same behaviour)
  • load them into shared memory
  • sync the threads within the block
  • reduce the values down in shared memory

Here is the complete code, which you can copy & paste and run:

#include <stdio.h>
#include <math.h>
#include <cuda_runtime.h>

// switch the compiler flag if you don't have the SDK's helper_cuda.h file
#if 1
#include "helper_cuda.h"
#else
#define checkCudaErrors(val) (val)
#define getLastCudaError(msg)
#endif

#ifdef __CDT_PARSER__
#define __global__
#define __device__
#define __shared__
#define __host__
#endif

// compute the sum of val over num threads
__device__ float localSum(const float& val, volatile float* reductionSpace, const uint& localId)
{
    reductionSpace[localId] = val;  // load data into shared mem
    __syncthreads();

    // complete loop unroll
    if (localId < 128) reductionSpace[localId] += reductionSpace[localId + 128];
    __syncthreads();

    if (localId < 64) reductionSpace[localId] += reductionSpace[localId + 64];
    __syncthreads();

    // within one warp (= 32 threads) instructions are SIMD synchronous
    // -> __syncthreads() not needed
    if (localId < 32)
    {
        reductionSpace[localId] += reductionSpace[localId + 32];
        reductionSpace[localId] += reductionSpace[localId + 16];
        reductionSpace[localId] += reductionSpace[localId + 8];
        reductionSpace[localId] += reductionSpace[localId + 4];
        reductionSpace[localId] += reductionSpace[localId + 2];
        reductionSpace[localId] += reductionSpace[localId + 1];
    }

    // EDIT: here we need to sync in order to guarantee that the thread with id 0 is done...
    __syncthreads();

    return reductionSpace[0];
}

__global__ void d_kernel(float* od, int n)
{
    extern __shared__ float reductionSpace[];
    int g_idx = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int linId = threadIdx.x;
    __shared__ float partialSums[21];

    float tmp[6] = { 0, 0, 0, 0, 0, 0 };

    // simplification: the real computations are removed - this version still shows the same behaviour
    if (g_idx < n)
    {
        tmp[0] = 1.0f;
        tmp[1] = 1.0f;
        tmp[2] = 1.0f;
        tmp[3] = 1.0f;
        tmp[4] = 1.0f;
        tmp[5] = 1.0f;
    }

    float res = 0.0f;
    int c = 0;
    for (int i = 0; i < 6; ++i)
    {
        for (int j = i; j < 6; ++j, ++c)
        {
            res = tmp[i] * tmp[j];
            // compute the sum of res over all blockDim.x threads. This
            // uses the shared memory reductionSpace for the calculations
            partialSums[c] = localSum(res, reductionSpace, linId);
        }
    }
    __syncthreads();

    // write the summed values of this block
    if (linId < 21)
    {
        atomicAdd(&od[linId], partialSums[linId]);
    }
}

int main()
{
    int w = 320;
    int h = 240;
    int n = w * h;

    // ------------------------------------------------------------------------------------
    float* d_out;
    checkCudaErrors(cudaMalloc(&d_out, 21 * sizeof(float)));
    float* h_out = new float[21];

    int dimBlock = 256;
    int dimGrid = (n - 1) / dimBlock + 1;
    int sharedMemSize = dimBlock * sizeof(float);

    printf("w: %d\n", w);
    printf("h: %d\n", h);
    printf("dimBlock: %d\n", dimBlock);
    printf("dimGrid: %d\n", dimGrid);
    printf("sharedMemSize: %d\n", sharedMemSize);

    int failCounter = 0;
    float target = (float) n;
    int c = 0;
    // ------------------------------------------------------------------------------------

    // run the kernel 200 times
    for (int run = 0; run < 200; ++run)
    {
        cudaMemset(d_out, 0, 21 * sizeof(float));
        d_kernel<<<dimGrid, dimBlock, sharedMemSize>>>(d_out, n);
        getLastCudaError("d_kernel");

        checkCudaErrors(cudaMemcpy(h_out, d_out, 21 * sizeof(float), cudaMemcpyDeviceToHost));

        // check if the output has the target value;
        // since every thread contributes the value 1, the kernel output corresponds to counting the elements w*h = n
        bool failed = false;
        for (int i = 0; i < 21; ++i)
        {
            if (fabsf(h_out[i] - target) > 0.01f)
            {
                ++failCounter;
                failed = true;
            }
        }

        // if it failed, print all elements to show which ones failed
        if (failed)
        {
            c = 0;
            for (int i = 0; i < 6; ++i)
            {
                for (int j = i; j < 6; ++j, ++c)
                {
                    printf("%10.7f ", h_out[c]);
                }
                printf("\n");
            }
        }
    }

    printf("failCounter: %d\n", failCounter);

    // ------------------------------------------------------------------------------------
    delete[] h_out;
    checkCudaErrors(cudaFree(d_out));
    // ------------------------------------------------------------------------------------

    return 0;
}

Some comments:

The block size is 256, so the unrolled loop in localSum() checks the right thread IDs. As mentioned at the beginning, out of 200 runs sometimes everything is correct, sometimes only 2 values are wrong, and sometimes 150 or so are wrong.

And it doesn't have anything to do with floating-point precision, since only 1 x 1 is multiplied and stored in the variable res in d_kernel(). One can clearly see that sometimes some threads or blocks just don't seem to get started, but I don't know why. :/

Just looking at the results, it should be obvious that there is some kind of race condition, but I can't see the problem.

Does anybody have an idea what the problem is?

Edit:

I tested a lot of things and saw that it has something to do with the block size. If I reduce it to something <= 64 and change localSum() accordingly (roughly like the sketch below), everything works as expected.
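The <= 64 variant isn't shown in the post; a minimal sketch of what "changed accordingly" presumably looks like, assuming a block size of exactly 64, would be:

// minimal sketch (my reconstruction, not the original post's code), assuming blockDim.x == 64:
// the 128- and 64-wide reduction steps are dropped and only the warp-synchronous part remains
__device__ float localSum(const float& val, volatile float* reductionSpace, const uint& localId)
{
    reductionSpace[localId] = val;  // load data into shared mem
    __syncthreads();

    if (localId < 32)
    {
        reductionSpace[localId] += reductionSpace[localId + 32];
        reductionSpace[localId] += reductionSpace[localId + 16];
        reductionSpace[localId] += reductionSpace[localId + 8];
        reductionSpace[localId] += reductionSpace[localId + 4];
        reductionSpace[localId] += reductionSpace[localId + 2];
        reductionSpace[localId] += reductionSpace[localId + 1];
    }

    // note: this version still returns without a final barrier; with only two
    // warps per block the failure apparently never showed up in my runs
    // (see the solution at the end)
    return reductionSpace[0];
}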

But that makes no sense to me?! This is still nothing more than a normal parallel reduction with shared memory; the only difference is that it is done 21 times per thread.

Edit 2:

Now I'm completely confused. The problem is the loop unrolling!! Or, better said, synchronizing the warp. The following localSum() code works:

// compute the sum of val over num threads
__device__ float localSum(const float& val, volatile float* reductionSpace, const uint& localId)
{
    reductionSpace[localId] = val;  // load data into shared mem
    __syncthreads();

    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
    {
        if (localId < s)
        {
            reductionSpace[localId] += reductionSpace[localId + s];
        }

        __syncthreads();
    }

    return reductionSpace[0];
}

But if I unroll the last warp and do not synchronize between the threads, I again get 2 or 3 wrong results out of 2000 runs. The following code does NOT work:

// compute the sum of val over num threads
__device__ float localSum(const float& val, volatile float* reductionSpace, const uint& localId)
{
    reductionSpace[localId] = val;  // load data into shared mem
    __syncthreads();

    for (unsigned int s = blockDim.x / 2; s > 32; s >>= 1)
    {
        if (localId < s)
        {
            reductionSpace[localId] += reductionSpace[localId + s];
        }

        __syncthreads();
    }

    if (localId < 32)
    {
        reductionSpace[localId] += reductionSpace[localId + 32];
        reductionSpace[localId] += reductionSpace[localId + 16];
        reductionSpace[localId] += reductionSpace[localId + 8];
        reductionSpace[localId] += reductionSpace[localId + 4];
        reductionSpace[localId] += reductionSpace[localId + 2];
        reductionSpace[localId] += reductionSpace[localId + 1];
    }

    return reductionSpace[0];
}

But how does that make sense, since CUDA executes one warp (32 threads) simultaneously and no __syncthreads() should be needed?!

I don't need anybody to post working code here; I'm asking somebody with a lot of experience and deep knowledge of CUDA programming to describe the underlying problem to me, or at least give me a hint.

The solution was so easy that I'm almost ashamed to tell it. I was blinded and looked everywhere but not at the most obvious code: a simple __syncthreads() was missing before the return statement in localSum(). Because only the last warp is executed simultaneously, it is not guaranteed that the warp containing thread id 0 is done when the other threads return the result... Such a stupid mistake, and I didn't see it.
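For completeness, here is the unrolled localSum() from Edit 2 with that single barrier added (the complete listing at the top already contains the same fix, marked with the EDIT comment):

// compute the sum of val over num threads
__device__ float localSum(const float& val, volatile float* reductionSpace, const uint& localId)
{
    reductionSpace[localId] = val;  // load data into shared mem
    __syncthreads();

    for (unsigned int s = blockDim.x / 2; s > 32; s >>= 1)
    {
        if (localId < s)
        {
            reductionSpace[localId] += reductionSpace[localId + s];
        }

        __syncthreads();
    }

    if (localId < 32)   // only warp 0 enters this branch
    {
        reductionSpace[localId] += reductionSpace[localId + 32];
        reductionSpace[localId] += reductionSpace[localId + 16];
        reductionSpace[localId] += reductionSpace[localId + 8];
        reductionSpace[localId] += reductionSpace[localId + 4];
        reductionSpace[localId] += reductionSpace[localId + 2];
        reductionSpace[localId] += reductionSpace[localId + 1];
    }

    // the previously missing barrier: without it, the other warps (localId >= 32)
    // skip the branch above and can read reductionSpace[0] before warp 0 has
    // finished the last additions
    __syncthreads();

    return reductionSpace[0];
}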

Sorry for the trouble.. :)