parallel processing - OpenCL barrier of finding max in a block -
i've found piece of opencl kernel sample code in nvidia's developer site purpose function maxoneblock
find out biggest value of array maxvalue
, store maxvalue[0].
i understand looping part, confused unroll
part: why unroll part not need sync thread after each step done?
e.g: when 1 thread done comparison of localid , localid+32, how ensure other thread have stored result localid+16?
the kernel code:
void maxoneblock(__local float maxvalue[], __local int maxind[]) { uint localid = get_local_id(0); uint localsize = get_local_size(0); int idx; float m1, m2, m3; (uint s = localsize/2; s > 32; s >>= 1) { if (localid < s) { m1 = maxvalue[localid]; m2 = maxvalue[localid+s]; m3 = (m1 >= m2) ? m1 : m2; idx = (m1 >= m2) ? localid : localid + s; maxvalue[localid] = m3; maxind[localid] = maxind[idx]; } barrier(clk_local_mem_fence); } // unroll final warp reduce loop , sync overheads if (localid < 32) { m1 = maxvalue[localid]; m2 = maxvalue[localid+32]; m3 = (m1 > m2) ? m1 : m2; idx = (m1 > m2) ? localid : localid + 32; maxvalue[localid] = m3; maxind[localid] = maxind[idx]; m1 = maxvalue[localid]; m2 = maxvalue[localid+16]; m3 = (m1 > m2) ? m1 : m2; idx = (m1 > m2) ? localid : localid + 16; maxvalue[localid] = m3; maxind[localid] = maxind[idx]; m1 = maxvalue[localid]; m2 = maxvalue[localid+8]; m3 = (m1 > m2) ? m1 : m2; idx = (m1 > m2) ? localid : localid + 8; maxvalue[localid] = m3; maxind[localid] = maxind[idx]; m1 = maxvalue[localid]; m2 = maxvalue[localid+4]; m3 = (m1 > m2) ? m1 : m2; idx = (m1 > m2) ? localid : localid + 4; maxvalue[localid] = m3; maxind[localid] = maxind[idx]; m1 = maxvalue[localid]; m2 = maxvalue[localid+2]; m3 = (m1 > m2) ? m1 : m2; idx = (m1 > m2) ? localid : localid + 2; maxvalue[localid] = m3; maxind[localid] = maxind[idx]; m1 = maxvalue[localid]; m2 = maxvalue[localid+1]; m3 = (m1 > m2) ? m1 : m2; idx = (m1 > m2) ? localid : localid + 1; maxvalue[localid] = m3; maxind[localid] = maxind[idx]; } }
why unroll part not need sync thread after each step done?
the sample incorrect, barrier indeed required after each step.
it looks sample written in warp-synchronous style, way of exploiting underlying execution mechanism of warps on nvidia hardware, incorrect synchronization cause break if underlying execution mechanism changes or in presence of compiler optimizations.
Comments
Post a Comment