I have the following problem: I have implemented several different parallel reduction algorithms, and all of them work correctly if I'm reducing only one value per kernel. But I need to reduce several values (21), and I have no idea why it sometimes works and sometimes doesn't.
The steps performed are:
- calculate the relevant values per thread (in the example I just set them all to 1, since it shows the same behaviour)
- load them into shared memory
- sync the threads within the block
- reduce the values in shared memory
Here is the complete code, which you can copy & paste and run:
#include <stdio.h>
#include <math.h>
#include <cuda_runtime.h>

// switch the compiler flag if you don't have the SDK's helper_cuda.h file
#if 1
#include "helper_cuda.h"
#else
#define checkCudaErrors(val) (val)
#define getLastCudaError(msg)
#endif

#ifdef __CDT_PARSER__
#define __global__
#define __device__
#define __shared__
#define __host__
#endif

// compute the sum of val over all threads of the block
__device__ float localSum(const float& val, volatile float* reductionSpace, const uint& localId)
{
    reductionSpace[localId] = val;  // load data into shared mem
    __syncthreads();

    // complete loop unroll
    if (localId < 128) reductionSpace[localId] += reductionSpace[localId + 128];
    __syncthreads();

    if (localId < 64) reductionSpace[localId] += reductionSpace[localId + 64];
    __syncthreads();

    // within one warp (=32 threads) instructions are SIMD synchronous
    // -> __syncthreads() not needed
    if (localId < 32)
    {
        reductionSpace[localId] += reductionSpace[localId + 32];
        reductionSpace[localId] += reductionSpace[localId + 16];
        reductionSpace[localId] += reductionSpace[localId + 8];
        reductionSpace[localId] += reductionSpace[localId + 4];
        reductionSpace[localId] += reductionSpace[localId + 2];
        reductionSpace[localId] += reductionSpace[localId + 1];
    }

    // EDIT: here we need to sync in order to guarantee that the thread with id 0 is done...
    __syncthreads();

    return reductionSpace[0];
}

__global__ void d_kernel(float* od, int n)
{
    extern __shared__ float reductionSpace[];
    int g_idx = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int linId = threadIdx.x;
    __shared__ float partialSums[21];

    float tmp[6] = { 0, 0, 0, 0, 0, 0 };

    // simplification of the real computations - this version still shows the same behaviour
    if (g_idx < n)
    {
        tmp[0] = 1.0f;
        tmp[1] = 1.0f;
        tmp[2] = 1.0f;
        tmp[3] = 1.0f;
        tmp[4] = 1.0f;
        tmp[5] = 1.0f;
    }

    float res = 0.0f;
    int c = 0;
    for (int i = 0; i < 6; ++i)
    {
        for (int j = i; j < 6; ++j, ++c)
        {
            res = tmp[i] * tmp[j];
            // compute the sum of the values res over blockDim.x threads; this uses
            // the shared memory reductionSpace for the calculations
            partialSums[c] = localSum(res, reductionSpace, linId);
        }
    }
    __syncthreads();

    // write the summed values of this block to global memory
    if (linId < 21)
    {
        atomicAdd(&od[linId], partialSums[linId]);
    }
}

int main()
{
    int w = 320;
    int h = 240;
    int n = w * h;

    // ------------------------------------------------------------------------------------
    float *d_out;
    checkCudaErrors(cudaMalloc(&d_out, 21 * sizeof(float)));
    float* h_out = new float[21];

    int dimBlock = 256;
    int dimGrid = (n - 1) / dimBlock + 1;
    int sharedMemSize = dimBlock * sizeof(float);

    printf("w: %d\n", w);
    printf("h: %d\n", h);
    printf("dimBlock: %d\n", dimBlock);
    printf("dimGrid: %d\n", dimGrid);
    printf("sharedMemSize: %d\n", sharedMemSize);

    int failcounter = 0;
    float target = (float) n;
    int c = 0;

    // ------------------------------------------------------------------------------------
    // run the kernel 200 times
    for (int run = 0; run < 200; ++run)
    {
        cudaMemset(d_out, 0, 21 * sizeof(float));
        d_kernel<<<dimGrid, dimBlock, sharedMemSize>>>(d_out, n);
        getLastCudaError("d_kernel");

        checkCudaErrors(cudaMemcpy(h_out, d_out, 21 * sizeof(float), cudaMemcpyDeviceToHost));

        // check if the output has the target value;
        // since all threads use the value 1, the kernel output corresponds to counting the elements, i.e. w*h = n
        bool failed = false;
        for (int i = 0; i < 21; ++i)
        {
            if (fabsf(h_out[i] - target) > 0.01f)
            {
                ++failcounter;
                failed = true;
            }
        }

        // if it failed, print all elements to show which ones failed
        if (failed)
        {
            c = 0;
            for (int i = 0; i < 6; ++i)
            {
                for (int j = i; j < 6; ++j, ++c)
                {
                    printf("%10.7f ", h_out[c]);
                }
                printf("\n");
            }
        }
    }

    printf("failcounter: %d\n", failcounter);

    // ------------------------------------------------------------------------------------
    delete[] h_out;
    checkCudaErrors(cudaFree(d_out));

    // ------------------------------------------------------------------------------------
    return 0;
}
Some comments:
The block size is always 256, so the unrolled loop in localSum() checks the right thread IDs. As mentioned at the beginning, out of 200 runs sometimes everything is correct, sometimes only 2 values are wrong, and sometimes 150 or so are wrong.
And it can't be a floating point precision issue, since only 1x1 is multiplied and stored in the variable res in d_kernel(). It looks as if some threads or blocks simply don't get started, but I don't know why. :/
Just looking at the results, it should be obvious that there is some kind of race condition, but I can't see the problem.
Does anybody have an idea what the problem is?
EDIT:
I tested a lot of things and saw that it has to do with the block size. If I reduce it to something <= 64 and change localSum() accordingly, everything works as expected.
But that makes no sense to me?! It is still nothing more than a normal parallel reduction in shared memory; the only difference is that it is done 21 times per thread.
EDIT 2:
Now I'm really confused. The problem is the unrolling of the loop!! Or better said, the (missing) synchronization of the warp. The following localSum() code works:
// compute the sum of val over all threads of the block
__device__ float localSum(const float& val, volatile float* reductionSpace, const uint& localId)
{
    reductionSpace[localId] = val;  // load data into shared mem
    __syncthreads();

    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
    {
        if (localId < s)
        {
            reductionSpace[localId] += reductionSpace[localId + s];
        }
        __syncthreads();
    }

    return reductionSpace[0];
}
But if I unroll the last warp and do not synchronize between the threads, I again get 2 or 3 wrong results out of 2000 runs. The following code does not work:
// compute the sum of val over all threads of the block
__device__ float localSum(const float& val, volatile float* reductionSpace, const uint& localId)
{
    reductionSpace[localId] = val;  // load data into shared mem
    __syncthreads();

    for (unsigned int s = blockDim.x / 2; s > 32; s >>= 1)
    {
        if (localId < s)
        {
            reductionSpace[localId] += reductionSpace[localId + s];
        }
        __syncthreads();
    }

    if (localId < 32)
    {
        reductionSpace[localId] += reductionSpace[localId + 32];
        reductionSpace[localId] += reductionSpace[localId + 16];
        reductionSpace[localId] += reductionSpace[localId + 8];
        reductionSpace[localId] += reductionSpace[localId + 4];
        reductionSpace[localId] += reductionSpace[localId + 2];
        reductionSpace[localId] += reductionSpace[localId + 1];
    }

    return reductionSpace[0];
}
But how does that make sense, since CUDA executes one warp (32 threads) simultaneously and no __syncthreads() should be needed?!
I don't need anybody to post working code here; I'm asking somebody with a lot of experience and deep knowledge of CUDA programming to describe the underlying problem to me, or at least give me a hint.
The solution was so easy that I'm almost ashamed to tell it. I was blinded and looked everywhere except at the most obvious piece of code: a simple __syncthreads() was missing before the return statement in localSum(). Because only the last warp executes the final additions, it is not guaranteed that the thread with ID 0 is done before the other warps read the result... Such a stupid mistake, and I didn't see it.
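For completeness, here is a minimal sketch of localSum() with the fix applied and the race spelled out in a comment; it only restates the explanation above, the rest of the function is unchanged:

// corrected localSum(): identical to the original except for the barrier before the return
__device__ float localSum(const float& val, volatile float* reductionSpace, const uint& localId)
{
    reductionSpace[localId] = val;  // load data into shared mem
    __syncthreads();

    if (localId < 128) reductionSpace[localId] += reductionSpace[localId + 128];
    __syncthreads();
    if (localId < 64) reductionSpace[localId] += reductionSpace[localId + 64];
    __syncthreads();

    if (localId < 32)
    {
        reductionSpace[localId] += reductionSpace[localId + 32];
        reductionSpace[localId] += reductionSpace[localId + 16];
        reductionSpace[localId] += reductionSpace[localId + 8];
        reductionSpace[localId] += reductionSpace[localId + 4];
        reductionSpace[localId] += reductionSpace[localId + 2];
        reductionSpace[localId] += reductionSpace[localId + 1];
    }

    // Without this barrier, threads with localId >= 32 skip the block above,
    // may read reductionSpace[0] before thread 0 has finished the last
    // additions, and then overwrite their reductionSpace slot at the start of
    // the next localSum() call while warp 0 is still summing.
    __syncthreads();

    return reductionSpace[0];
}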
Sorry for the trouble.. :)
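As an aside that goes beyond the original question: on GPUs with independent thread scheduling (Volta and later), even the threads of a single warp are not guaranteed to run in lock-step, so the volatile-pointer trick for the unrolled last warp is no longer considered safe on its own. A hedged sketch of the same last-warp steps using warp shuffles (__shfl_down_sync, available since CUDA 9) sidesteps the shared-memory races inside the warp entirely; the names reuse those from the code above, and this block would replace the 'if (localId < 32) { ... }' part:

// alternative last-warp reduction using warp shuffles instead of shared memory
if (localId < 32)
{
    // fold the upper half of the remaining 64 partial sums into the warp
    float v = reductionSpace[localId] + reductionSpace[localId + 32];

    // reduce within the warp; each step adds the value held by the lane
    // 'offset' positions higher, so lane 0 ends up with the warp total
    v += __shfl_down_sync(0xffffffff, v, 16);
    v += __shfl_down_sync(0xffffffff, v, 8);
    v += __shfl_down_sync(0xffffffff, v, 4);
    v += __shfl_down_sync(0xffffffff, v, 2);
    v += __shfl_down_sync(0xffffffff, v, 1);

    if (localId == 0)
        reductionSpace[0] = v;
}

// the barrier before returning is still required so that every thread of the
// block sees the final value written by thread 0
__syncthreads();
return reductionSpace[0];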