I need to perform sparse matrix-vector multiplication in OpenCL, but the kernel takes a long time to finish. I assume it is slow because the kernel uses an atomic-add function.
This is the kernel code:
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable

/*
 * Atomically performs `*loc += f` on a float stored in global memory.
 *
 * The update is a compare-and-swap loop on the 32-bit pattern of the float:
 * the sum is computed from the last observed value and only stored if the
 * location still holds that value; otherwise the loop retries with the value
 * that atomic_cmpxchg reported.
 *
 * loc  address of the float to update
 * f    value to add
 */
void float_atomic_add(__global float *loc, const float f)
{
    /* OpenCL C allows reading a union member other than the one last
     * written, so this reinterprets the bits without pointer-cast punning. */
    union {
        int   bits;
        float value;
    } expected, desired, observed;

    /* volatile: the location is modified concurrently by other work-items. */
    volatile __global int *addr = (volatile __global int *)loc;

    observed.value = *(volatile __global float *)loc;
    do {
        expected.bits = observed.bits;
        desired.value = expected.value + f;
        /* Returns the value found at addr; equal to expected.bits on success. */
        observed.bits = atomic_cmpxchg(addr, expected.bits, desired.bits);
    } while (observed.bits != expected.bits);
}

/*
 * Accumulates the contributions of one stored matrix entry into b.
 *
 * Work-item i reads entry i of col/row/data and adds data[i] * x[...] to
 * eight elements of b: the entry itself, its x, y and xy lookups, and the
 * same four again starting from the z lookup of the entry.
 *
 * The kernel only adds to b, so the host must initialize b before launch.
 * There is no bounds check: the global work size must not exceed the number
 * of entries in col/row/data.
 *
 * NOTE(review): every work-item issues eight atomic updates; work-items that
 * hit the same element of b retry inside float_atomic_add, which is the
 * likely cost driver when many entries share a row.
 *
 * col, row, data   column index, row index and value of each stored entry
 * symmlor_*        lookup tables indexed by a row index, giving the row
 *                  index used for the corresponding extra contribution
 * symm_*pixel      lookup tables indexed by a column index, giving the
 *                  column index used for the corresponding extra contribution
 * x                input vector (read only)
 * b                output vector (accumulated into atomically)
 */
__kernel void forward(__global int *col, __global int *row, __global float *data,
                      __global int *symmlor_x, __global int *symm_xpixel,
                      __global int *symmlor_y, __global int *symm_ypixel,
                      __global int *symmlor_xy, __global int *symm_xypixel,
                      __global int *symmlor_z, __global int *symm_zpixel,
                      __global float *x, __global float *b)
{
    const int i = (int)get_global_id(0);

    const int   pixel = col[i];
    const float v     = data[i];
    const int   lor   = row[i];

    /* Entry itself: b[lor] += v * x[pixel] */
    float_atomic_add(&b[lor], v * x[pixel]);

    /* x, y and xy lookups of the entry. */
    float_atomic_add(&b[symmlor_x[lor]],  v * x[symm_xpixel[pixel]]);
    float_atomic_add(&b[symmlor_y[lor]],  v * x[symm_ypixel[pixel]]);
    float_atomic_add(&b[symmlor_xy[lor]], v * x[symm_xypixel[pixel]]);

    /* z symmetry: repeat the same four updates from the z lookup. */
    const int lor_z   = symmlor_z[lor];
    const int pixel_z = symm_zpixel[pixel];

    float_atomic_add(&b[lor_z], v * x[pixel_z]);
    float_atomic_add(&b[symmlor_x[lor_z]],  v * x[symm_xpixel[pixel_z]]);
    float_atomic_add(&b[symmlor_y[lor_z]],  v * x[symm_ypixel[pixel_z]]);
    float_atomic_add(&b[symmlor_xy[lor_z]], v * x[symm_xypixel[pixel_z]]);
}
The sparse matrix is stored in COO format.