CUDA loop unrolling on triangular region -


is possible unroll loop on triangular region, such as:

for (int = 0; < row_length; i++) {     (int j = 0; j < i; j++)     {         // array operation here     } } 

where row_length constant defined @ compile time? stands now, don't think possible because changing program executes (and more importantly, it's not constant @ compile time). suppose treat 2d array 1d array, iterate 0 (row_length^2)/2, , try couple math tricks indices, operations defeat purpose of loop unrolling in first place.

the cuda 7.0 compiler unroll in test. loop indices known @ compile time there's no reason why shouldn't able to.

consider following code, sets triangular portion of 1.

#define row_length 4 __global__ void triunrolltest1(float* a) {    #pragma unroll    (int = 0; < row_length; i++)    {       #pragma unroll       (int j = 0; j < i; j++)       {          a[i * row_length + j] = 1.f;       }    } } 

as row_length 4 can unroll ourselves:

__global__ void triunrolltest2(float* a) {    a[1 * row_length + 0] = 1.f;    a[2 * row_length + 0] = 1.f;    a[2 * row_length + 1] = 1.f;    a[3 * row_length + 0] = 1.f;    a[3 * row_length + 1] = 1.f;    a[3 * row_length + 2] = 1.f; } 

compiling sm 35 using cuda 7.0: nvcc -arch=sm_35 -c triunroll.cu

then dumping sass assembler: cuobjdump --dump-sass triunroll.o

we get:

code sm_35         function : _z14triunrolltest1pf .headerflags    @"ef_cuda_sm35 ef_cuda_ptx_sm(ef_cuda_sm35)"                                                           /* 0x08b8b8a0b010a000 */ /*0008*/                   mov r1, c[0x0][0x44];          /* 0x64c03c00089c0006 */ /*0010*/                   mov r0, c[0x0][0x140];         /* 0x64c03c00281c0002 */ /*0018*/                   iadd r2.cc, r0, 0x10;          /* 0xc0840000081c0009 */ /*0020*/                   mov32i r0, 0x3f800000;         /* 0x741fc000001fc002 */ /*0028*/                   iadd.x r3, rz, c[0x0][0x144];  /* 0x60804000289ffc0e */ /*0030*/                   st.e [r2], r0;                 /* 0xe4800000001c0800 */ /*0038*/                   st.e [r2+0x10], r0;            /* 0xe4800000081c0800 */                                                           /* 0x080000b810b8b8b8 */ /*0048*/                   st.e [r2+0x14], r0;            /* 0xe48000000a1c0800 */ /*0050*/                   st.e [r2+0x20], r0;            /* 0xe4800000101c0800 */ /*0058*/                   st.e [r2+0x24], r0;            /* 0xe4800000121c0800 */ /*0060*/                   st.e [r2+0x28], r0;            /* 0xe4800000141c0800 */ /*0068*/                   exit;                          /* 0x18000000001c003c */ /*0070*/                   bra 0x70;                      /* 0x12007ffffc1c003c */ /*0078*/                   nop;                           /* 0x85800000001c3c02 */         .....................................           function : _z14triunrolltest2pf .headerflags    @"ef_cuda_sm35 ef_cuda_ptx_sm(ef_cuda_sm35)"                                                           /* 0x08b8b8a0b010a000 */ /*0008*/                   mov r1, c[0x0][0x44];          /* 0x64c03c00089c0006 */ /*0010*/                   mov r0, c[0x0][0x140];         /* 0x64c03c00281c0002 */ /*0018*/                   iadd r2.cc, r0, 0x10;          /* 0xc0840000081c0009 */ /*0020*/                   mov32i r0, 0x3f800000;         /* 0x741fc000001fc002 */ /*0028*/                   iadd.x r3, rz, c[0x0][0x144];  /* 0x60804000289ffc0e */ /*0030*/                   st.e [r2], r0;                 /* 0xe4800000001c0800 */ /*0038*/                   st.e [r2+0x10], r0;            /* 0xe4800000081c0800 */                                                           /* 0x080000b810b8b8b8 */ /*0048*/                   st.e [r2+0x14], r0;            /* 0xe48000000a1c0800 */ /*0050*/                   st.e [r2+0x20], r0;            /* 0xe4800000101c0800 */ /*0058*/                   st.e [r2+0x24], r0;            /* 0xe4800000121c0800 */ /*0060*/                   st.e [r2+0x28], r0;            /* 0xe4800000141c0800 */ /*0068*/                   exit;                          /* 0x18000000001c003c */ /*0070*/                   bra 0x70;                      /* 0x12007ffffc1c003c */ /*0078*/                   nop;                           /* 0x85800000001c3c02 */         ..................................... 

obviously both same , nicely unrolled. interestingly when accidentally compiled 6.5 first answer compiler did not unroll, guess pays date in case!