* ggml-cuda: fix race condition in cumsum
* remove unnecessary __syncthreads()
// Add offset to each item and store
T thread_offset = thread_prefix - thread_sum + block_carry;
- #pragma unroll
+#pragma unroll
for (int i = 0; i < UNROLL_FACTOR; i++) {
int64_t idx = start + tid * UNROLL_FACTOR + i;
if (idx < ne00) {
}
}
+ __syncthreads();
+
// Update carry for next tile
if (tid == 0) {
block_carry += block_total;
}
- __syncthreads();
}
#else
NO_DEVICE_CODE;
}
}
+ __syncthreads();
+
// Update carry for next chunk
if (tid == 0) {
*s_carry += *s_chunk_total;
}
- __syncthreads();
}
}