}
static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const half2 b) {
-#if defined(GGML_USE_HIP) && HIP_VERSION >= 50700000
+#if defined(GGML_USE_HIP)
return half2(__hmax(a.x, b.x), __hmax(a.y, b.y));
-#elif !defined(GGML_USE_HIP) && CUDART_VERSION >= CUDART_HMAX
+#elif CUDART_VERSION >= CUDART_HMAX
return __hmax2(a, b);
-#elif !defined(GGML_USE_HIP)
+#else
half2 ret;
reinterpret_cast<half&>(ret.x) = __float2half(fmaxf( __low2float(a), __low2float(b)));
reinterpret_cast<half&>(ret.y) = __float2half(fmaxf(__high2float(a), __high2float(b)));
return ret;
-#else
- GGML_UNUSED(a);
- GGML_UNUSED(b);
- NO_DEVICE_CODE;
#endif
}
template<int width = WARP_SIZE>
static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
-#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
+#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || defined(GGML_USE_HIP)
#pragma unroll
for (int offset = width/2; offset > 0; offset >>= 1) {
x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, width));
#else
GGML_UNUSED(x);
NO_DEVICE_CODE;
-#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
+#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || defined(GGML_USE_HIP)
}
#if CUDART_VERSION < CUDART_HMASK
#endif // defined(GGML_USE_HIP)
static ggml_cuda_device_info ggml_cuda_init() {
-#if defined(GGML_USE_HIP)
- // Workaround for a rocBLAS bug when using multiple graphics cards:
- // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
- {
- int major_version = 0;
- size_t version_length = 0;
- if (rocblas_get_version_string_size(&version_length) == rocblas_status_success) {
- std::vector<char> version(version_length+1, '\0');
- if (rocblas_get_version_string(version.data(), version.size()) == rocblas_status_success) {
- version.resize(::strlen(version.data()));
- int parsed_value = 0;
- if (std::from_chars(version.data(), version.data() + version.size(), parsed_value).ec == std::errc()) {
- major_version = parsed_value;
- }
- }
- }
- if (major_version < 4) {
- GGML_LOG_DEBUG(GGML_CUDA_NAME " calling rocblas_initialize as a workaround for a rocBLAS bug\n");
- rocblas_initialize();
- CUDA_CHECK(cudaDeviceSynchronize());
- }
- }
-#endif
-
ggml_cuda_device_info info = {};
cudaError_t err = cudaGetDeviceCount(&info.device_count);
#include <hipblas/hipblas.h>
#include <hip/hip_fp16.h>
#include <hip/hip_bfloat16.h>
-// for rocblas_initialize()
-#include "rocblas/rocblas.h"
#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
}
return c;
}
-
-#if HIP_VERSION < 50600000
-// __shfl_xor() for half2 was added in ROCm 5.6
-static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int width) {
- typedef union half2_b32 {
- half2 val;
- int b32;
- } half2_b32_t;
- half2_b32_t tmp;
- tmp.val = var;
- tmp.b32 = __shfl_xor(tmp.b32, laneMask, width);
- return tmp.val;
-}
-#endif // HIP_VERSION < 50600000
endif()
endif()
-if (${hip_VERSION} VERSION_LESS 5.5)
- message(FATAL_ERROR "At least ROCM/HIP V5.5 is required")
+if (${hip_VERSION} VERSION_LESS 6.1)
+ message(FATAL_ERROR "At least ROCM/HIP V6.1 is required")
endif()
message(STATUS "HIP and hipBLAS found")