C++ - How to avoid float atomic add in OpenCL


I need to compute a sparse matrix-vector multiplication in OpenCL, but the kernel takes a long time to finish. I assume it is slow because the kernel uses an atomic-add function.

This is the code of the kernel:

#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable

/*
 * Atomically performs *loc += f on a float in global memory.
 *
 * There is no native float atomic add here, so the float is reinterpreted
 * as a 32-bit integer and updated with a compare-and-swap retry loop.
 *
 * loc: address of the float to update.
 * f:   value to add.
 */
void float_atomic_add(__global float *loc, const float f)
{
    /* A union reinterprets the bits without pointer-cast type punning. */
    union {
        float f;
        int i;
    } expected, desired, observed;

    /* Volatile read so the initial load is not cached or hoisted. */
    expected.f = *(volatile __global float *)loc;

    for (;;) {
        desired.f = expected.f + f;
        observed.i = atomic_cmpxchg((volatile __global int *)loc,
                                    expected.i, desired.i);
        if (observed.i == expected.i) {
            break; /* nobody else modified *loc in between: swap succeeded */
        }
        /*
         * Retry using the value the CAS actually observed instead of
         * re-reading *loc through a plain pointer.
         */
        expected.i = observed.i;
    }
}

/*
 * Accumulates the contribution of one matrix entry (lor, pixel, v) into b,
 * followed by its x, y and xy counterparts looked up through the symmetry
 * tables. Order of the four updates: direct, x, y, xy.
 */
void scatter_xy_symmetries(__global float *b, __global float *x, const float v,
                           const int lor, const int pixel,
                           __global int *symmlor_x, __global int *symm_xpixel,
                           __global int *symmlor_y, __global int *symm_ypixel,
                           __global int *symmlor_xy, __global int *symm_xypixel)
{
    /* Several work-items may target the same b[] element, hence atomics. */
    float_atomic_add(&b[lor], v * x[pixel]);
    float_atomic_add(&b[symmlor_x[lor]], v * x[symm_xpixel[pixel]]);
    float_atomic_add(&b[symmlor_y[lor]], v * x[symm_ypixel[pixel]]);
    float_atomic_add(&b[symmlor_xy[lor]], v * x[symm_xypixel[pixel]]);
}

/*
 * Forward projection b += A * x for a sparse matrix given as triplets
 * (row[i], col[i], data[i]). Each work-item handles the single entry at
 * index get_global_id(0) and scatters it to eight locations of b: the entry
 * itself and its x / y / xy counterparts, then the same four for its z
 * counterpart.
 *
 * col, row, data:         triplet arrays of the stored entries.
 * symmlor_*, symm_*pixel: index lookup tables mapping a row / column index
 *                         to its counterpart (presumably geometric mirror
 *                         symmetries -- confirm against the host code).
 * x:                      input vector.
 * b:                      output vector, updated atomically.
 *
 * NOTE(review): there is no bounds check on the work-item index; the host
 * must launch a global size no larger than the number of stored entries.
 */
__kernel void forward(__global int *col, __global int *row, __global float *data,
                      __global int *symmlor_x, __global int *symm_xpixel,
                      __global int *symmlor_y, __global int *symm_ypixel,
                      __global int *symmlor_xy, __global int *symm_xypixel,
                      __global int *symmlor_z, __global int *symm_zpixel,
                      __global float *x, __global float *b)
{
    /* The original declaration omitted the variable name `i`. */
    const int i = get_global_id(0);

    const int pixel = col[i];
    const float v = data[i];
    const int lor = row[i];

    scatter_xy_symmetries(b, x, v, lor, pixel,
                          symmlor_x, symm_xpixel,
                          symmlor_y, symm_ypixel,
                          symmlor_xy, symm_xypixel);

    /* z symmetry: repeat the same four updates from the z counterpart. */
    const int lor_z = symmlor_z[lor];
    const int pixel_z = symm_zpixel[pixel];

    scatter_xy_symmetries(b, x, v, lor_z, pixel_z,
                          symmlor_x, symm_xpixel,
                          symmlor_y, symm_ypixel,
                          symmlor_xy, symm_xypixel);
}

The sparse matrix is stored in COO format.