message(STATUS "GGML_SYCL_TARGET=${GGML_SYCL_TARGET}")
-if (NOT GGML_SYCL_TARGET MATCHES "^(INTEL|NVIDIA|AMD)$")
- message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL, NVIDIA, or AMD")
+if (NOT GGML_SYCL_TARGET MATCHES "^(INTEL)$")
+ message(FATAL_ERROR "GGML_SYCL_TARGET: Invalid target, the supported options are [INTEL]")
endif()
check_cxx_compiler_flag("-fsycl" SUPPORTS_SYCL)
target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_DNNL=${GGML_SYCL_DNNL})
if (GGML_SYCL_F16)
- if (GGML_SYCL_TARGET STREQUAL "AMD")
- message(WARNING "AMD target does not entirely support FP16 in the SYCL backend.")
- endif()
add_compile_definitions(GGML_SYCL_F16)
endif()
if (GGML_SYCL_TARGET STREQUAL "INTEL")
add_compile_definitions(GGML_SYCL_WARP_SIZE=16)
target_link_options(ggml-sycl PRIVATE -Xs -ze-intel-greater-than-4GB-buffer-required)
-elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA")
- add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
-elseif (GGML_SYCL_TARGET STREQUAL "AMD")
- # INFO: Allowed Sub_group_sizes are not consistent through all
- # hip targets. For example, 64 is used for certain models, but the backend
- # does not support it.
- # Target archs tested working: gfx1030, gfx1031, (Only tested sub_group_size = 32)
- add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
-else()
- # default for other target
- add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
-endif()
-
-if (GGML_SYCL_GRAPH)
- target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_GRAPH)
-endif()
-# Link against Intel oneMKL or oneMath
-if (GGML_SYCL_TARGET STREQUAL "INTEL")
- # Intel devices use Intel oneMKL directly instead of oneMath to avoid the limitation of linking Intel oneMKL statically
- # See https://github.com/uxlfoundation/oneMath/issues/654
+ # Link against Intel oneMKL
if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
set(SYCL_COMPILER ON)
endif()
find_package(MKL REQUIRED)
target_link_libraries(ggml-sycl PRIVATE MKL::MKL_SYCL::BLAS)
- target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_USE_INTEL_ONEMKL)
else()
- find_package(oneMath QUIET)
- if (NOT oneMath_FOUND)
- message(STATUS "oneMath not found: oneMath will be automatically downloaded")
- # Use FetchContent to automatically pull and build oneMath
- include(FetchContent)
- set(BUILD_FUNCTIONAL_TESTS False)
- set(BUILD_EXAMPLES False)
- set(TARGET_DOMAINS blas)
- if (GGML_SYCL_TARGET STREQUAL "NVIDIA")
- set(ENABLE_MKLCPU_BACKEND False)
- set(ENABLE_MKLGPU_BACKEND False)
- set(ENABLE_CUBLAS_BACKEND True)
- elseif (GGML_SYCL_TARGET STREQUAL "AMD")
- set(ENABLE_MKLCPU_BACKEND False)
- set(ENABLE_MKLGPU_BACKEND False)
- set(ENABLE_ROCBLAS_BACKEND True)
- # Ensure setting a string variable here is not overriden by oneMath CACHE variables
- cmake_policy(SET CMP0126 NEW)
- # Setting the device architecture is only needed and useful for AMD devices in oneMath
- set(HIP_TARGETS ${GGML_SYCL_DEVICE_ARCH} CACHE STRING "oneMath HIP target" FORCE)
- endif()
- FetchContent_Declare(
- ONEMATH
- GIT_REPOSITORY https://github.com/uxlfoundation/oneMath.git
- GIT_TAG 8efe85f5aaebb37f1d8c503b7af66315feabf142
- )
- FetchContent_MakeAvailable(ONEMATH)
- # Create alias to match with find_package targets name
- function(onemath_alias target)
- if (TARGET ${target}_obj)
- # Silence verbose warnings from external libraries
- target_compile_options(${target}_obj PRIVATE -w)
- endif()
- if (TARGET ${target})
- add_library(ONEMATH::${target} ALIAS ${target})
- endif()
- endfunction()
- onemath_alias(onemath)
- onemath_alias(onemath_blas_mklcpu)
- onemath_alias(onemath_blas_mklgpu)
- onemath_alias(onemath_blas_cublas)
- onemath_alias(onemath_blas_rocblas)
- endif()
+ # default for other target
+ message(FATAL_ERROR "GGML_SYCL_TARGET is not supported")
+ add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
+endif()
- # Below oneMath compile-time dispatching is used for better performance
- if (GGML_SYCL_TARGET STREQUAL "NVIDIA")
- target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath_blas_cublas)
- target_compile_options(ggml-sycl PRIVATE "-fsycl-targets=nvptx64-nvidia-cuda")
- target_link_options(ggml-sycl PRIVATE "-fsycl-targets=nvptx64-nvidia-cuda")
- target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_NVIDIA)
- elseif (GGML_SYCL_TARGET STREQUAL "AMD")
- if (NOT GGML_SYCL_DEVICE_ARCH)
- message(FATAL_ERROR "Can't enable SYCL hip backend, GGML_SYCL_DEVICE_ARCH has not been set.")
- endif()
- target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath_blas_rocblas)
- target_compile_options(ggml-sycl PRIVATE "-fsycl-targets=amdgcn-amd-amdhsa")
- target_link_options(ggml-sycl PRIVATE "-fsycl-targets=amdgcn-amd-amdhsa")
- target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_AMD)
- else()
- # Fallback to oneMath runtime dispatcher
- target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath)
- target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_GENERIC)
- endif()
+if (GGML_SYCL_GRAPH)
+ target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_GRAPH)
endif()
if (GGML_SYCL_DEVICE_ARCH)
#include <sycl/sycl.hpp>
#include <sycl/half_type.hpp>
-#include <map>
-
-#ifdef GGML_SYCL_USE_INTEL_ONEMKL
#include <oneapi/mkl.hpp>
-// Allow to use the same namespace for Intel oneMKL and oneMath
-namespace oneapi {
- namespace math = mkl;
-}
-#else
-#include <oneapi/math.hpp>
-#endif
+
+#include <map>
#include "ggml.h"
}
template <typename Ts> struct matrix_info_t {
- oneapi::math::transpose transpose_info[2];
+ oneapi::mkl::transpose transpose_info[2];
Ts value_info[2];
std::int64_t size_info[3];
std::int64_t ld_info[3];
std::int64_t groupsize_info;
};
-inline auto get_onemath_backend(sycl::queue& queue)
-#if defined(GGML_SYCL_GENERIC) || defined(GGML_SYCL_USE_INTEL_ONEMKL)
- -> sycl::queue&
-#endif
-{
-// If the backend is known at compile-time, use oneMath backend_selector to use
-// compile-time dispatching and avoid the need to dlopen libraries. Otherwise
-// fallback to runtime dispatching.
-#if defined(GGML_SYCL_NVIDIA)
- return oneapi::math::backend_selector<oneapi::math::backend::cublas>{ queue };
-#elif defined(GGML_SYCL_AMD)
- return oneapi::math::backend_selector<oneapi::math::backend::rocblas>{ queue };
-#elif defined(GGML_SYCL_GENERIC) || defined(GGML_SYCL_USE_INTEL_ONEMKL)
- return queue;
-#else
- static_assert(false, "Unsupported backend");
-#endif
-}
-
namespace dpct
{
typedef sycl::queue *queue_ptr;
namespace detail
{
template <class Ta, class Tb, class Tc, class Ts>
- inline void gemm_impl(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans, int m,
+ inline void gemm_impl(sycl::queue & q, oneapi::mkl::transpose a_trans, oneapi::mkl::transpose b_trans, int m,
int n, int k, const void * alpha, const void * a, int lda, const void * b, int ldb,
const void * beta, void * c, int ldc) {
Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
auto data_a = get_memory<const Ta>(a);
auto data_b = get_memory<const Tb>(b);
auto data_c = get_memory<Tc>(c);
- oneapi::math::blas::column_major::gemm(get_onemath_backend(q), a_trans, b_trans, m, n, k, alpha_value, data_a,
+ oneapi::mkl::blas::column_major::gemm(q, a_trans, b_trans, m, n, k, alpha_value, data_a,
lda, data_b, ldb, beta_value, data_c, ldc);
}
};
template <class Ta, class Tb, class Tc, class Ts>
- inline void gemm_batch_impl(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans,
+ inline void gemm_batch_impl(sycl::queue & q, oneapi::mkl::transpose a_trans, oneapi::mkl::transpose b_trans,
int m, int n, int k, const void * alpha, const void ** a, int lda, const void ** b,
int ldb, const void * beta, void ** c, int ldc, int batch_size,
matrix_info_t<float> * matrix_info) {
matrix_info->ld_info[2] = ldc;
matrix_info->groupsize_info = batch_size;
- sycl::event e = oneapi::math::blas::column_major::gemm_batch(
- get_onemath_backend(q), matrix_info->transpose_info, matrix_info->transpose_info + 1,
+ sycl::event e = oneapi::mkl::blas::column_major::gemm_batch(
+ q, matrix_info->transpose_info, matrix_info->transpose_info + 1,
matrix_info->size_info, matrix_info->size_info + 1, matrix_info->size_info + 2,
reinterpret_cast<Ts *>(matrix_info->value_info), reinterpret_cast<const Ta **>(a), matrix_info->ld_info,
reinterpret_cast<const Tb **>(b), matrix_info->ld_info + 1,
}
template <class Ta, class Tb, class Tc, class Ts>
- inline void gemm_batch_impl(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans,
+ inline void gemm_batch_impl(sycl::queue & q, oneapi::mkl::transpose a_trans, oneapi::mkl::transpose b_trans,
int m, int n, int k, const void * alpha, const void * a, int lda,
long long int stride_a, const void * b, int ldb, long long int stride_b,
const void * beta, void * c, int ldc, long long int stride_c, int batch_size) {
auto data_a = get_memory<const Ta>(a);
auto data_b = get_memory<const Tb>(b);
auto data_c = get_memory<Tc>(c);
- oneapi::math::blas::column_major::gemm_batch(get_onemath_backend(q), a_trans, b_trans, m, n, k, alpha_value,
+ oneapi::mkl::blas::column_major::gemm_batch(q, a_trans, b_trans, m, n, k, alpha_value,
data_a, lda, stride_a, data_b, ldb, stride_b, beta_value,
data_c, ldc, stride_c, batch_size);
}
sycl::range<3>(x, y, 1), direction);
}
- inline void gemm(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans, int m, int n,
+ inline void gemm(sycl::queue & q, oneapi::mkl::transpose a_trans, oneapi::mkl::transpose b_trans, int m, int n,
int k, const void * alpha, const void * a, library_data_t a_type, int lda, const void * b,
library_data_t b_type, int ldb, const void * beta, void * c, library_data_t c_type, int ldc,
library_data_t scaling_type) {
library_data_t::real_bfloat16, library_data_t::real_bfloat16,
library_data_t::real_float, library_data_t::real_float):
{
- detail::gemm_impl<oneapi::math::bfloat16, oneapi::math::bfloat16, float, float>(
+ detail::gemm_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float, float>(
q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
break;
}
library_data_t::real_bfloat16, library_data_t::real_bfloat16,
library_data_t::real_bfloat16, library_data_t::real_float):
{
- detail::gemm_impl<oneapi::math::bfloat16, oneapi::math::bfloat16, oneapi::math::bfloat16, float>(
+ detail::gemm_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float>(
q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
break;
}
/// \param [in] ldc Leading dimension of C.
/// \param [in] batch_size Specifies the number of matrix multiply operations to perform.
/// \param [in] scaling_type Data type of the scaling factors.
- inline void gemm_batch(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans, int m,
+ inline void gemm_batch(sycl::queue & q, oneapi::mkl::transpose a_trans, oneapi::mkl::transpose b_trans, int m,
int n, int k, const void * alpha, const void * a[], library_data_t a_type, int lda,
const void * b[], library_data_t b_type, int ldb, const void * beta, void * c[],
library_data_t c_type, int ldc, int batch_size, library_data_t scaling_type,
library_data_t::real_bfloat16, library_data_t::real_bfloat16,
library_data_t::real_bfloat16, library_data_t::real_float):
{
- detail::gemm_batch_impl<oneapi::math::bfloat16, oneapi::math::bfloat16, oneapi::math::bfloat16, float>(
+ detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float>(
q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info);
break;
}
library_data_t::real_bfloat16, library_data_t::real_bfloat16,
library_data_t::real_float, library_data_t::real_float):
{
- detail::gemm_batch_impl<oneapi::math::bfloat16, oneapi::math::bfloat16, float, float>(
+ detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float, float>(
q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info);
break;
}
/// \param [in] stride_c Stride between the different C matrices.
/// \param [in] batch_size Specifies the number of matrix multiply operations to perform.
/// \param [in] scaling_type Data type of the scaling factors.
- inline void gemm_batch(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans, int m,
+ inline void gemm_batch(sycl::queue & q, oneapi::mkl::transpose a_trans, oneapi::mkl::transpose b_trans, int m,
int n, int k, const void * alpha, const void * a, library_data_t a_type, int lda,
long long int stride_a, const void * b, library_data_t b_type, int ldb,
long long int stride_b, const void * beta, void * c, library_data_t c_type, int ldc,
library_data_t::real_bfloat16, library_data_t::real_bfloat16,
library_data_t::real_bfloat16, library_data_t::real_float):
{
- detail::gemm_batch_impl<oneapi::math::bfloat16, oneapi::math::bfloat16, oneapi::math::bfloat16, float>(
+ detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float>(
q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c,
batch_size);
break;
library_data_t::real_bfloat16, library_data_t::real_bfloat16,
library_data_t::real_float, library_data_t::real_float):
{
- detail::gemm_batch_impl<oneapi::math::bfloat16, oneapi::math::bfloat16, float, float>(
+ detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float, float>(
q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c,
batch_size);
break;
const sycl::half alpha_f16 = 1.0f;
const sycl::half beta_f16 = 0.0f;
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm(
- *stream, oneapi::math::transpose::trans,
- oneapi::math::transpose::nontrans, row_diff, src1_ncols, ne10,
+ *stream, oneapi::mkl::transpose::trans,
+ oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10,
&alpha_f16, src0_ptr, dpct::library_data_t::real_half, ne00,
src1_ptr, dpct::library_data_t::real_half, ne10, &beta_f16,
dst_f16.get(), dpct::library_data_t::real_half, ldc,
{
const float alpha = 1.0f;
const float beta = 0.0f;
- SYCL_CHECK(CHECK_TRY_ERROR(oneapi::math::blas::column_major::gemm(
- get_onemath_backend(*stream), oneapi::math::transpose::trans, oneapi::math::transpose::nontrans, row_diff,
+ SYCL_CHECK(CHECK_TRY_ERROR(oneapi::mkl::blas::column_major::gemm(
+ *stream, oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, row_diff,
src1_ncols, ne10, dpct::get_value(&alpha, *stream), src0_ddf_i, ne00, src1_ddf1_i, ne10,
dpct::get_value(&beta, *stream), dst_dd_i, ldc)));
}
const int64_t smb = ne12 == 1 ? s13 : s12;
// there is no broadcast and src0, src1 are contiguous across dims 2, 3
- SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(*queue, oneapi::math::transpose::trans,
- oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha,
+ SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(*queue, oneapi::mkl::transpose::trans,
+ oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha,
src0_f16, dpct::library_data_t::real_half, nb01 / nb00, sma,
src1_f16, dpct::library_data_t::real_half, s11, smb, beta, dst_ddf,
mkl_data_type, ne0, ne1 * ne0, ne12 * ne13, mkl_compute_type)));
});
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
- *queue, oneapi::math::transpose::trans, oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha,
+ *queue, oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha,
(const void **) (ptrs_src.get() + 0 * ne23), dpct::library_data_t::real_half, nb01 / nb00,
(const void **) (ptrs_src.get() + 1 * ne23), dpct::library_data_t::real_half, s11, beta,
(void **) (ptrs_dst.get() + 0 * ne23), mkl_data_type, ne0, ne23, mkl_compute_type, matrix_info.get())));
use_mul_mat_q = use_mul_mat_q && (src1->ne[1] <= MMQ_MAX_BATCH_SIZE);
#endif // SYCL_USE_XMX
- // mmvq path is faster in the CUDA backend.
- if (!g_ggml_sycl_prioritize_dmmv && (ctx.stream()->get_backend() == sycl::backend::ext_oneapi_cuda
- // Dispatch becomes obscure with the reorder, MMVQ when the reorder optimization
- // is enabled takes precedence over DMMV, the current if-else implementation
- // requires disabling DMMV if both conditions are met
- || (should_reorder_tensor(ctx, dst) && ggml_sycl_supports_reorder_mmvq(src0->type)))) {
+ // Dispatch becomes obscure with the reorder, MMVQ when the reorder optimization
+ // is enabled takes precedence over DMMV, the current if-else implementation
+ // requires disabling DMMV if both conditions are met
+ if (!g_ggml_sycl_prioritize_dmmv && ((should_reorder_tensor(ctx, dst) &&
+ ggml_sycl_supports_reorder_mmvq(src0->type)))) {
use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
}
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_memory\n");
ggml_sycl_set_device(device);
- /*
- DPCT1009:218: SYCL uses exceptions to report errors and does not use the
- error codes. The original code was commented out and a warning string was
- inserted. You need to rewrite this code.
- */
- /*
- DPCT1106:217: 'cudaMemGetInfo' was migrated with the Intel extensions for
- device information which may not be supported by all compilers or runtimes.
- You may need to adjust the code.
- */
SYCL_CHECK(CHECK_TRY_ERROR(
dpct::dev_mgr::instance().get_device(device).get_memory_info(*free, *total)));
}