return a;
}
-//static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
-//#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
-//#pragma unroll
-// for (int mask = 16; mask > 0; mask >>= 1) {
-// a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
-// }
-// return a;
-//#else
-// (void) a;
-// NO_DEVICE_CODE;
-//#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
-//}
+#ifdef GGML_CUDA_F16
+static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
+#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
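+    // XOR butterfly: masks 16, 8, 4, 2, 1 pair every lane with a partner,
+    // so after 5 steps all 32 lanes hold the full sum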
+#pragma unroll
+ for (int mask = 16; mask > 0; mask >>= 1) {
+ a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
+ }
+ return a;
+#else
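+    // taken on HIP/AMD builds and on pre-Pascal targets, where these half2
+    // intrinsics are unavailable; (void) a silences the unused warning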
+ (void) a;
+ NO_DEVICE_CODE;
+#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+}
+#endif // GGML_CUDA_F16
static __device__ __forceinline__ float warp_reduce_max(float x) {
#pragma unroll
#endif
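The float overloads these hunks call into lie mostly outside the excerpt: only the closing `return a;` of warp_reduce_sum(float) and the signature of warp_reduce_max(float) are visible above. Reconstructed from the shuffle loops being deleted below, they presumably read (a sketch, not the verbatim source):

static __device__ __forceinline__ float warp_reduce_sum(float a) {
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        a += __shfl_xor_sync(0xffffffff, a, mask, 32);
    }
    return a;
}

static __device__ __forceinline__ float warp_reduce_max(float x) {
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
    }
    return x;
}

Where a kernel's accumulator is a half2 (the GGML_CUDA_F16 path), the identical call site resolves to the new half2 overload by ordinary overload resolution.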
// sum up partial sums and write back result
-#pragma unroll
- for (int mask = 16; mask > 0; mask >>= 1) {
- tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
- }
+ tmp = warp_reduce_sum(tmp);
if (threadIdx.x == 0) {
dst[row] = tmp;
#endif
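Each hunk from here on makes the same mechanical substitution: the open-coded shuffle loop becomes a call to the helper. A hypothetical single-warp test kernel (not part of this patch; names are illustrative) shows the invariant the helper provides: every lane contributes its lane id, and every lane ends up holding the full sum 0 + 1 + ... + 31 = 496.

// hypothetical check, assuming warp_reduce_sum(float) from above is in scope;
// launch as check_warp_reduce_sum<<<1, 32>>>(d_out) so the block is one warp
static __global__ void check_warp_reduce_sum(float * dst) {
    float v = (float) threadIdx.x;   // lane id: 0..31
    v = warp_reduce_sum(v);          // butterfly reduction across the warp
    if (threadIdx.x == 0) {
        dst[0] = v;                  // every lane holds 496.0f; lane 0 stores it
    }
}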
// sum up partial sums and write back result
-#pragma unroll
- for (int mask = 16; mask > 0; mask >>= 1) {
- tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
- }
+ tmp = warp_reduce_sum(tmp);
if (threadIdx.x == 0) {
dst[row] = tmp;
#endif
// sum up partial sums and write back result
-#pragma unroll
- for (int mask = 16; mask > 0; mask >>= 1) {
- tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
- }
+ tmp = warp_reduce_sum(tmp);
if (tid == 0) {
dst[row] = tmp;
#endif
// sum up partial sums and write back result
-#pragma unroll
- for (int mask = 16; mask > 0; mask >>= 1) {
- tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
- }
+ tmp = warp_reduce_sum(tmp);
if (threadIdx.x == 0) {
dst[row] = tmp;
#endif
// sum up partial sums and write back result
-#pragma unroll
- for (int mask = 16; mask > 0; mask >>= 1) {
- tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
- }
+ tmp = warp_reduce_sum(tmp);
if (tid == 0) {
dst[row] = tmp;
float amax = fabsf(xi);
float sum = xi;
-#pragma unroll
- for (int mask = 16; mask > 0; mask >>= 1) {
- amax = fmaxf(amax, __shfl_xor_sync(0xffffffff, amax, mask, 32));
- sum += __shfl_xor_sync(0xffffffff, sum, mask, 32);
- }
+ amax = warp_reduce_max(amax);
+ sum = warp_reduce_sum(sum);
const float d = amax / 127;
const int8_t q = amax == 0.0f ? 0 : roundf(xi / d);
}
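To make the quantization math concrete: after the two reductions, every lane of the warp holds the block-wide amax and sum, so each lane can scale its own xi independently. With illustrative numbers, amax = 12.7 gives d = 12.7 / 127 = 0.1, and a lane holding xi = 0.57 stores q = roundf(0.57 / 0.1) = 6; the amax == 0.0f guard avoids a 0/0 for an all-zero block. The reduced sum is presumably written into the block header next to d (q8_1 blocks carry both), which is why it is computed here even though this excerpt does not show the store.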
// sum up partial sums and write back result
-#pragma unroll
- for (int mask = 16; mask > 0; mask >>= 1) {
- tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
- }
+ tmp = warp_reduce_sum(tmp);
if (tid == 0) {
#ifdef GGML_CUDA_F16
const int idst = channel*nrows_dst + row_dst;
// sum up partial sums and write back result
-#pragma unroll
- for (int mask = 16; mask > 0; mask >>= 1) {
- tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
- }
+ tmp = warp_reduce_sum(tmp);
if (threadIdx.x == 0) {
dst[idst] = tmp;
}
// sum up partial sums and write back result
-#pragma unroll
- for (int mask = 16; mask > 0; mask >>= 1) {
- tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
- }
+ tmp = warp_reduce_sum(tmp);
if (threadIdx.x == 0) {
dst[idst] = tmp;