float2 * dstk_fixup_meta = dstk_fixup + (gridDim.x + blockIdx.x)*ncols;
dstk_fixup_meta[(threadIdx.y/np)*cols_per_warp + threadIdx.x] = make_float2(KQ_cmn, KQ_crs);
}
+ } else if (np > 1) {
+ // Warps with threadIdx.y % np == 0 execute a __syncthreads() in the if branch.
+ // Therefore, all other warps also need to execute a __syncthreads().
+ // Otherwise the points at which warps synchronize with each other would become misaligned.
+ __syncthreads();
}
#pragma unroll