#include <map>
#include <array>
+// stringize macro for converting __CUDA_ARCH_LIST__ (list of integers) to string
+#define STRINGIZE_IMPL(...) #__VA_ARGS__
+#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)
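+// e.g. building for sm_52 and sm_70 makes __CUDA_ARCH_LIST__ expand to 520,700, so
+// STRINGIZE(__CUDA_ARCH_LIST__) yields the string literal "520,700"; the STRINGIZE_IMPL
+// indirection is needed because # stringizes its argument without macro-expanding it first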
+
#if defined(GGML_USE_HIPBLAS)
#include <hip/hip_runtime.h>
#include <hipblas/hipblas.h>
@@ ... @@
static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
[[noreturn]]
-static __device__ void bad_arch() {
- printf("ERROR: ggml-cuda was compiled without support for the current GPU architecture.\n");
+static __device__ void no_device_code(
+ const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+ printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
+ file_name, line, function_name, arch);
+ (void) arch_list;
+#else
+ printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
+ file_name, line, function_name, arch, arch_list);
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
__trap();
- (void) bad_arch; // suppress unused function warning
+ (void) no_device_code; // suppress unused function warning
}
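+// Illustrative output (hypothetical file/line/kernel/arch values):
+//   ggml-cuda.cu:1234: ERROR: CUDA kernel mul_mat_q4_0 has no device code compatible
+//   with CUDA arch 520. ggml-cuda.cu was compiled for: 700,750,800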
+#ifdef __CUDA_ARCH__
+#define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__))
+#else
+#define NO_DEVICE_CODE GGML_ASSERT(false && "NO_DEVICE_CODE not valid in host code.")
+#endif // __CUDA_ARCH__
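+// __CUDA_ARCH__ is only defined during nvcc's device compilation passes, so the host-side
+// expansion asserts instead: host code should never reach a NO_DEVICE_CODE branch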
+
static __device__ __forceinline__ float warp_reduce_sum(float x) {
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
    }
    return x;
}
@@ ... @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
    return a;
#else
(void) a;
- bad_arch();
+ NO_DEVICE_CODE;
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
}
@@ ... @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
    return x;
#else
(void) x;
- bad_arch();
+ NO_DEVICE_CODE;
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
}
@@ ... @@
}
#else
(void) vx; (void) y; (void) k;
- bad_arch();
+ NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_PASCAL
}
@@ ... @@
// second part effectively subtracts 8 from each quant value
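+// (i.e. sum_i (q_i - 8)*y_i == sum_i q_i*y_i - 8*sum_i y_i; ds8f.y holds the q8_1 block's
+// precomputed value sum, so the -8 offset is applied without unpacking the 4-bit quants)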
return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
#else
- bad_arch();
+ NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
@@ ... @@
// scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
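+// (each of the QI8_1/(vdr * QR4_1) threads covering this q8_1 block adds m4s8 once, so
+// dividing by that count keeps the constant m4*sum(y) term counted exactly once in total)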
return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
#else
- bad_arch();
+ NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
@@ ... @@
// second part effectively subtracts 16 from each quant value
return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
#else
- bad_arch();
+ NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
@@ ... @@
    return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
#else
- bad_arch();
+ NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
@@ ... @@
    return d8_0*d8_1 * sumi;
#else
- bad_arch();
+ NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
@@ ... @@
// scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
#else
- bad_arch();
+ NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
@@ ... @@
    return dm2f.x*sumf_d - dm2f.y*sumf_m;
#else
- bad_arch();
+ NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
@@ ... @@
    return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
#else
- bad_arch();
+ NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
@@ ... @@
    return d3 * sumf;
#else
- bad_arch();
+ NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
@@ ... @@
    return d3*d8 * sumi;
#else
- bad_arch();
+ NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
@@ ... @@
    return dm4f.x*sumf_d - dm4f.y*sumf_m;
#else
- bad_arch();
+ NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
@@ ... @@
    return dm4f.x*sumf_d - dm4f.y*sumf_m;
#else
- bad_arch();
+ NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
@@ ... @@
    return dm5f.x*sumf_d - dm5f.y*sumf_m;
#else
- bad_arch();
+ NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
@@ ... @@
    return dm4f.x*sumf_d - dm4f.y*sumf_m;
#else
- bad_arch();
+ NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
@@ ... @@
    return d*sumf;
#else
- bad_arch();
+ NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
@@ ... @@
    return d6 * sumf_d;
#else
- bad_arch();
+ NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
@@ ... @@
    return dall * sumf_d - dmin * sumf_m;
#else
- bad_arch();
+ NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
#endif
@@ ... @@
    return d * sumf_d;
#else
- bad_arch();
+ NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
#endif
@@ ... @@
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else
(void) vec_dot_q4_0_q8_1_mul_mat;
- bad_arch();
+ NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_VOLTA
}
@@ ... @@
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else
(void) vec_dot_q4_1_q8_1_mul_mat;
- bad_arch();
+ NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_VOLTA
}
@@ ... @@
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else
(void) vec_dot_q5_0_q8_1_mul_mat;
- bad_arch();
+ NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_VOLTA
}
@@ ... @@
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else
(void) vec_dot_q5_1_q8_1_mul_mat;
- bad_arch();
+ NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_VOLTA
}
@@ ... @@
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else
(void) vec_dot_q8_0_q8_1_mul_mat;
- bad_arch();
+ NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_VOLTA
}
@@ ... @@
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else
(void) vec_dot_q2_K_q8_1_mul_mat;
- bad_arch();
+ NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_VOLTA
}
@@ ... @@
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else
(void) vec_dot_q3_K_q8_1_mul_mat;
- bad_arch();
+ NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_VOLTA
}
@@ ... @@
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else
(void) vec_dot_q4_K_q8_1_mul_mat;
- bad_arch();
+ NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_VOLTA
}
@@ ... @@
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else
(void) vec_dot_q5_K_q8_1_mul_mat;
- bad_arch();
+ NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_VOLTA
}
@@ ... @@
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#else
(void) vec_dot_q6_K_q8_1_mul_mat;
- bad_arch();
+ NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= CC_VOLTA
}
@@ ... @@
    }
#else
(void) x; (void) y; (void) dst; (void) ncols_par; (void) nrows_y; (void) scale;
- bad_arch();
+ NO_DEVICE_CODE;
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
}