From: Neo Zhang Date: Mon, 2 Feb 2026 13:06:21 +0000 (+0800) Subject: Remove support for Nvidia & AMD GPU, because the oneAPI plugin for Nvidia & AMD GPU... X-Git-Tag: v0.9.7~67 X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=4c3c93f7185daebe8747d3e8b0728a4494d4bcc9;p=pkg%2Fggml%2Fsources%2Fggml Remove support for Nvidia & AMD GPU, because the oneAPI plugin for Nvidia & AMD GPU is unavailable: download/installation channels are out of work. (llama/19246) User can't build up the software for Nvidia & AMD GPU. rm the oneMath since it is only used in NV and AMD code path. --- diff --git a/src/ggml-sycl/CMakeLists.txt b/src/ggml-sycl/CMakeLists.txt index 5a89d8dd..eefdd972 100644 --- a/src/ggml-sycl/CMakeLists.txt +++ b/src/ggml-sycl/CMakeLists.txt @@ -1,7 +1,7 @@ message(STATUS "GGML_SYCL_TARGET=${GGML_SYCL_TARGET}") -if (NOT GGML_SYCL_TARGET MATCHES "^(INTEL|NVIDIA|AMD)$") - message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL, NVIDIA, or AMD") +if (NOT GGML_SYCL_TARGET MATCHES "^(INTEL)$") + message(FATAL_ERROR "GGML_SYCL_TARGET: Invalid target, the supported options are [INTEL]") endif() check_cxx_compiler_flag("-fsycl" SUPPORTS_SYCL) @@ -125,106 +125,27 @@ endif() target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_DNNL=${GGML_SYCL_DNNL}) if (GGML_SYCL_F16) - if (GGML_SYCL_TARGET STREQUAL "AMD") - message(WARNING "AMD target does not entirely support FP16 in the SYCL backend.") - endif() add_compile_definitions(GGML_SYCL_F16) endif() if (GGML_SYCL_TARGET STREQUAL "INTEL") add_compile_definitions(GGML_SYCL_WARP_SIZE=16) target_link_options(ggml-sycl PRIVATE -Xs -ze-intel-greater-than-4GB-buffer-required) -elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA") - add_compile_definitions(GGML_SYCL_WARP_SIZE=32) -elseif (GGML_SYCL_TARGET STREQUAL "AMD") - # INFO: Allowed Sub_group_sizes are not consistent through all - # hip targets. For example, 64 is used for certain models, but the backend - # does not support it. - # Target archs tested working: gfx1030, gfx1031, (Only tested sub_group_size = 32) - add_compile_definitions(GGML_SYCL_WARP_SIZE=32) -else() - # default for other target - add_compile_definitions(GGML_SYCL_WARP_SIZE=32) -endif() - -if (GGML_SYCL_GRAPH) - target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_GRAPH) -endif() -# Link against Intel oneMKL or oneMath -if (GGML_SYCL_TARGET STREQUAL "INTEL") - # Intel devices use Intel oneMKL directly instead of oneMath to avoid the limitation of linking Intel oneMKL statically - # See https://github.com/uxlfoundation/oneMath/issues/654 + # Link against Intel oneMKL if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") set(SYCL_COMPILER ON) endif() find_package(MKL REQUIRED) target_link_libraries(ggml-sycl PRIVATE MKL::MKL_SYCL::BLAS) - target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_USE_INTEL_ONEMKL) else() - find_package(oneMath QUIET) - if (NOT oneMath_FOUND) - message(STATUS "oneMath not found: oneMath will be automatically downloaded") - # Use FetchContent to automatically pull and build oneMath - include(FetchContent) - set(BUILD_FUNCTIONAL_TESTS False) - set(BUILD_EXAMPLES False) - set(TARGET_DOMAINS blas) - if (GGML_SYCL_TARGET STREQUAL "NVIDIA") - set(ENABLE_MKLCPU_BACKEND False) - set(ENABLE_MKLGPU_BACKEND False) - set(ENABLE_CUBLAS_BACKEND True) - elseif (GGML_SYCL_TARGET STREQUAL "AMD") - set(ENABLE_MKLCPU_BACKEND False) - set(ENABLE_MKLGPU_BACKEND False) - set(ENABLE_ROCBLAS_BACKEND True) - # Ensure setting a string variable here is not overriden by oneMath CACHE variables - cmake_policy(SET CMP0126 NEW) - # Setting the device architecture is only needed and useful for AMD devices in oneMath - set(HIP_TARGETS ${GGML_SYCL_DEVICE_ARCH} CACHE STRING "oneMath HIP target" FORCE) - endif() - FetchContent_Declare( - ONEMATH - GIT_REPOSITORY https://github.com/uxlfoundation/oneMath.git - GIT_TAG 8efe85f5aaebb37f1d8c503b7af66315feabf142 - ) - FetchContent_MakeAvailable(ONEMATH) - # Create alias to match with find_package targets name - function(onemath_alias target) - if (TARGET ${target}_obj) - # Silence verbose warnings from external libraries - target_compile_options(${target}_obj PRIVATE -w) - endif() - if (TARGET ${target}) - add_library(ONEMATH::${target} ALIAS ${target}) - endif() - endfunction() - onemath_alias(onemath) - onemath_alias(onemath_blas_mklcpu) - onemath_alias(onemath_blas_mklgpu) - onemath_alias(onemath_blas_cublas) - onemath_alias(onemath_blas_rocblas) - endif() + # default for other target + message(FATAL_ERROR "GGML_SYCL_TARGET is not supported") + add_compile_definitions(GGML_SYCL_WARP_SIZE=32) +endif() - # Below oneMath compile-time dispatching is used for better performance - if (GGML_SYCL_TARGET STREQUAL "NVIDIA") - target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath_blas_cublas) - target_compile_options(ggml-sycl PRIVATE "-fsycl-targets=nvptx64-nvidia-cuda") - target_link_options(ggml-sycl PRIVATE "-fsycl-targets=nvptx64-nvidia-cuda") - target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_NVIDIA) - elseif (GGML_SYCL_TARGET STREQUAL "AMD") - if (NOT GGML_SYCL_DEVICE_ARCH) - message(FATAL_ERROR "Can't enable SYCL hip backend, GGML_SYCL_DEVICE_ARCH has not been set.") - endif() - target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath_blas_rocblas) - target_compile_options(ggml-sycl PRIVATE "-fsycl-targets=amdgcn-amd-amdhsa") - target_link_options(ggml-sycl PRIVATE "-fsycl-targets=amdgcn-amd-amdhsa") - target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_AMD) - else() - # Fallback to oneMath runtime dispatcher - target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath) - target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_GENERIC) - endif() +if (GGML_SYCL_GRAPH) + target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_GRAPH) endif() if (GGML_SYCL_DEVICE_ARCH) diff --git a/src/ggml-sycl/dpct/helper.hpp b/src/ggml-sycl/dpct/helper.hpp index 8ae80987..ece66a7a 100644 --- a/src/ggml-sycl/dpct/helper.hpp +++ b/src/ggml-sycl/dpct/helper.hpp @@ -15,17 +15,9 @@ #include #include -#include - -#ifdef GGML_SYCL_USE_INTEL_ONEMKL #include -// Allow to use the same namespace for Intel oneMKL and oneMath -namespace oneapi { - namespace math = mkl; -} -#else -#include -#endif + +#include #include "ggml.h" @@ -91,32 +83,13 @@ inline std::string get_device_backend_and_type(const sycl::device &device) { } template struct matrix_info_t { - oneapi::math::transpose transpose_info[2]; + oneapi::mkl::transpose transpose_info[2]; Ts value_info[2]; std::int64_t size_info[3]; std::int64_t ld_info[3]; std::int64_t groupsize_info; }; -inline auto get_onemath_backend(sycl::queue& queue) -#if defined(GGML_SYCL_GENERIC) || defined(GGML_SYCL_USE_INTEL_ONEMKL) - -> sycl::queue& -#endif -{ -// If the backend is known at compile-time, use oneMath backend_selector to use -// compile-time dispatching and avoid the need to dlopen libraries. Otherwise -// fallback to runtime dispatching. -#if defined(GGML_SYCL_NVIDIA) - return oneapi::math::backend_selector{ queue }; -#elif defined(GGML_SYCL_AMD) - return oneapi::math::backend_selector{ queue }; -#elif defined(GGML_SYCL_GENERIC) || defined(GGML_SYCL_USE_INTEL_ONEMKL) - return queue; -#else - static_assert(false, "Unsupported backend"); -#endif -} - namespace dpct { typedef sycl::queue *queue_ptr; @@ -1734,7 +1707,7 @@ namespace dpct namespace detail { template - inline void gemm_impl(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans, int m, + inline void gemm_impl(sycl::queue & q, oneapi::mkl::transpose a_trans, oneapi::mkl::transpose b_trans, int m, int n, int k, const void * alpha, const void * a, int lda, const void * b, int ldb, const void * beta, void * c, int ldc) { Ts alpha_value = dpct::get_value(reinterpret_cast(alpha), q); @@ -1742,7 +1715,7 @@ namespace dpct auto data_a = get_memory(a); auto data_b = get_memory(b); auto data_c = get_memory(c); - oneapi::math::blas::column_major::gemm(get_onemath_backend(q), a_trans, b_trans, m, n, k, alpha_value, data_a, + oneapi::mkl::blas::column_major::gemm(q, a_trans, b_trans, m, n, k, alpha_value, data_a, lda, data_b, ldb, beta_value, data_c, ldc); } @@ -1774,7 +1747,7 @@ namespace dpct }; template - inline void gemm_batch_impl(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans, + inline void gemm_batch_impl(sycl::queue & q, oneapi::mkl::transpose a_trans, oneapi::mkl::transpose b_trans, int m, int n, int k, const void * alpha, const void ** a, int lda, const void ** b, int ldb, const void * beta, void ** c, int ldc, int batch_size, matrix_info_t * matrix_info) { @@ -1793,8 +1766,8 @@ namespace dpct matrix_info->ld_info[2] = ldc; matrix_info->groupsize_info = batch_size; - sycl::event e = oneapi::math::blas::column_major::gemm_batch( - get_onemath_backend(q), matrix_info->transpose_info, matrix_info->transpose_info + 1, + sycl::event e = oneapi::mkl::blas::column_major::gemm_batch( + q, matrix_info->transpose_info, matrix_info->transpose_info + 1, matrix_info->size_info, matrix_info->size_info + 1, matrix_info->size_info + 2, reinterpret_cast(matrix_info->value_info), reinterpret_cast(a), matrix_info->ld_info, reinterpret_cast(b), matrix_info->ld_info + 1, @@ -1803,7 +1776,7 @@ namespace dpct } template - inline void gemm_batch_impl(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans, + inline void gemm_batch_impl(sycl::queue & q, oneapi::mkl::transpose a_trans, oneapi::mkl::transpose b_trans, int m, int n, int k, const void * alpha, const void * a, int lda, long long int stride_a, const void * b, int ldb, long long int stride_b, const void * beta, void * c, int ldc, long long int stride_c, int batch_size) { @@ -1812,7 +1785,7 @@ namespace dpct auto data_a = get_memory(a); auto data_b = get_memory(b); auto data_c = get_memory(c); - oneapi::math::blas::column_major::gemm_batch(get_onemath_backend(q), a_trans, b_trans, m, n, k, alpha_value, + oneapi::mkl::blas::column_major::gemm_batch(q, a_trans, b_trans, m, n, k, alpha_value, data_a, lda, stride_a, data_b, ldb, stride_b, beta_value, data_c, ldc, stride_c, batch_size); } @@ -2299,7 +2272,7 @@ namespace dpct sycl::range<3>(x, y, 1), direction); } - inline void gemm(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans, int m, int n, + inline void gemm(sycl::queue & q, oneapi::mkl::transpose a_trans, oneapi::mkl::transpose b_trans, int m, int n, int k, const void * alpha, const void * a, library_data_t a_type, int lda, const void * b, library_data_t b_type, int ldb, const void * beta, void * c, library_data_t c_type, int ldc, library_data_t scaling_type) { @@ -2366,7 +2339,7 @@ namespace dpct library_data_t::real_bfloat16, library_data_t::real_bfloat16, library_data_t::real_float, library_data_t::real_float): { - detail::gemm_impl( + detail::gemm_impl( q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); break; } @@ -2405,7 +2378,7 @@ namespace dpct library_data_t::real_bfloat16, library_data_t::real_bfloat16, library_data_t::real_bfloat16, library_data_t::real_float): { - detail::gemm_impl( + detail::gemm_impl( q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); break; } @@ -2447,7 +2420,7 @@ namespace dpct /// \param [in] ldc Leading dimension of C. /// \param [in] batch_size Specifies the number of matrix multiply operations to perform. /// \param [in] scaling_type Data type of the scaling factors. - inline void gemm_batch(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans, int m, + inline void gemm_batch(sycl::queue & q, oneapi::mkl::transpose a_trans, oneapi::mkl::transpose b_trans, int m, int n, int k, const void * alpha, const void * a[], library_data_t a_type, int lda, const void * b[], library_data_t b_type, int ldb, const void * beta, void * c[], library_data_t c_type, int ldc, int batch_size, library_data_t scaling_type, @@ -2485,7 +2458,7 @@ namespace dpct library_data_t::real_bfloat16, library_data_t::real_bfloat16, library_data_t::real_bfloat16, library_data_t::real_float): { - detail::gemm_batch_impl( + detail::gemm_batch_impl( q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info); break; } @@ -2493,7 +2466,7 @@ namespace dpct library_data_t::real_bfloat16, library_data_t::real_bfloat16, library_data_t::real_float, library_data_t::real_float): { - detail::gemm_batch_impl( + detail::gemm_batch_impl( q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info); break; } @@ -2569,7 +2542,7 @@ namespace dpct /// \param [in] stride_c Stride between the different C matrices. /// \param [in] batch_size Specifies the number of matrix multiply operations to perform. /// \param [in] scaling_type Data type of the scaling factors. - inline void gemm_batch(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans, int m, + inline void gemm_batch(sycl::queue & q, oneapi::mkl::transpose a_trans, oneapi::mkl::transpose b_trans, int m, int n, int k, const void * alpha, const void * a, library_data_t a_type, int lda, long long int stride_a, const void * b, library_data_t b_type, int ldb, long long int stride_b, const void * beta, void * c, library_data_t c_type, int ldc, @@ -2642,7 +2615,7 @@ namespace dpct library_data_t::real_bfloat16, library_data_t::real_bfloat16, library_data_t::real_bfloat16, library_data_t::real_float): { - detail::gemm_batch_impl( + detail::gemm_batch_impl( q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size); break; @@ -2651,7 +2624,7 @@ namespace dpct library_data_t::real_bfloat16, library_data_t::real_bfloat16, library_data_t::real_float, library_data_t::real_float): { - detail::gemm_batch_impl( + detail::gemm_batch_impl( q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size); break; diff --git a/src/ggml-sycl/ggml-sycl.cpp b/src/ggml-sycl/ggml-sycl.cpp index c5139fd3..a03d26d7 100644 --- a/src/ggml-sycl/ggml-sycl.cpp +++ b/src/ggml-sycl/ggml-sycl.cpp @@ -2167,8 +2167,8 @@ inline void ggml_sycl_op_mul_mat_sycl( const sycl::half alpha_f16 = 1.0f; const sycl::half beta_f16 = 0.0f; SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm( - *stream, oneapi::math::transpose::trans, - oneapi::math::transpose::nontrans, row_diff, src1_ncols, ne10, + *stream, oneapi::mkl::transpose::trans, + oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10, &alpha_f16, src0_ptr, dpct::library_data_t::real_half, ne00, src1_ptr, dpct::library_data_t::real_half, ne10, &beta_f16, dst_f16.get(), dpct::library_data_t::real_half, ldc, @@ -2211,8 +2211,8 @@ inline void ggml_sycl_op_mul_mat_sycl( { const float alpha = 1.0f; const float beta = 0.0f; - SYCL_CHECK(CHECK_TRY_ERROR(oneapi::math::blas::column_major::gemm( - get_onemath_backend(*stream), oneapi::math::transpose::trans, oneapi::math::transpose::nontrans, row_diff, + SYCL_CHECK(CHECK_TRY_ERROR(oneapi::mkl::blas::column_major::gemm( + *stream, oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10, dpct::get_value(&alpha, *stream), src0_ddf_i, ne00, src1_ddf1_i, ne10, dpct::get_value(&beta, *stream), dst_dd_i, ldc))); } @@ -3165,8 +3165,8 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons const int64_t smb = ne12 == 1 ? s13 : s12; // there is no broadcast and src0, src1 are contiguous across dims 2, 3 - SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(*queue, oneapi::math::transpose::trans, - oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha, + SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(*queue, oneapi::mkl::transpose::trans, + oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha, src0_f16, dpct::library_data_t::real_half, nb01 / nb00, sma, src1_f16, dpct::library_data_t::real_half, s11, smb, beta, dst_ddf, mkl_data_type, ne0, ne1 * ne0, ne12 * ne13, mkl_compute_type))); @@ -3190,7 +3190,7 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons }); SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch( - *queue, oneapi::math::transpose::trans, oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha, + *queue, oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha, (const void **) (ptrs_src.get() + 0 * ne23), dpct::library_data_t::real_half, nb01 / nb00, (const void **) (ptrs_src.get() + 1 * ne23), dpct::library_data_t::real_half, s11, beta, (void **) (ptrs_dst.get() + 0 * ne23), mkl_data_type, ne0, ne23, mkl_compute_type, matrix_info.get()))); @@ -3524,12 +3524,11 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor use_mul_mat_q = use_mul_mat_q && (src1->ne[1] <= MMQ_MAX_BATCH_SIZE); #endif // SYCL_USE_XMX - // mmvq path is faster in the CUDA backend. - if (!g_ggml_sycl_prioritize_dmmv && (ctx.stream()->get_backend() == sycl::backend::ext_oneapi_cuda - // Dispatch becomes obscure with the reorder, MMVQ when the reorder optimization - // is enabled takes precedence over DMMV, the current if-else implementation - // requires disabling DMMV if both conditions are met - || (should_reorder_tensor(ctx, dst) && ggml_sycl_supports_reorder_mmvq(src0->type)))) { + // Dispatch becomes obscure with the reorder, MMVQ when the reorder optimization + // is enabled takes precedence over DMMV, the current if-else implementation + // requires disabling DMMV if both conditions are met + if (!g_ggml_sycl_prioritize_dmmv && ((should_reorder_tensor(ctx, dst) && + ggml_sycl_supports_reorder_mmvq(src0->type)))) { use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q; } @@ -4189,16 +4188,6 @@ void ggml_backend_sycl_get_device_memory(int device, size_t *free, GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_memory\n"); ggml_sycl_set_device(device); - /* - DPCT1009:218: SYCL uses exceptions to report errors and does not use the - error codes. The original code was commented out and a warning string was - inserted. You need to rewrite this code. - */ - /* - DPCT1106:217: 'cudaMemGetInfo' was migrated with the Intel extensions for - device information which may not be supported by all compilers or runtimes. - You may need to adjust the code. - */ SYCL_CHECK(CHECK_TRY_ERROR( dpct::dev_mgr::instance().get_device(device).get_memory_info(*free, *total))); } diff --git a/src/ggml-sycl/outprod.cpp b/src/ggml-sycl/outprod.cpp index 3a17f3a1..f52b11f0 100644 --- a/src/ggml-sycl/outprod.cpp +++ b/src/ggml-sycl/outprod.cpp @@ -32,12 +32,12 @@ void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, ggml_tensor* dst) { // Handle transposition of src1 const bool src1_T = ggml_is_transposed(src1); - const oneapi::math::transpose src1_op = src1_T ? oneapi::math::transpose::nontrans : oneapi::math::transpose::trans; + const oneapi::mkl::transpose src1_op = src1_T ? oneapi::mkl::transpose::nontrans : oneapi::mkl::transpose::trans; const int64_t ldb = (src1_T ? nb10 : nb11) / sizeof(float); try { - // Perform matrix multiplication using oneMath GEMM - oneapi::math::blas::column_major::gemm(get_onemath_backend(*stream), oneapi::math::transpose::nontrans, src1_op, + // Perform matrix multiplication using oneMKL GEMM + oneapi::mkl::blas::column_major::gemm(*stream, oneapi::mkl::transpose::nontrans, src1_op, ne0, ne1, ne01, alpha, src0_d, ne00, src1_d, ldb, beta, dst_d, ne0); } catch (sycl::exception const& exc) { diff --git a/src/ggml-sycl/rope.cpp b/src/ggml-sycl/rope.cpp index 69140b19..aeaa58b9 100644 --- a/src/ggml-sycl/rope.cpp +++ b/src/ggml-sycl/rope.cpp @@ -207,7 +207,6 @@ static void rope_vision(const T * x, T * dst, const int ne0, const int ne1, cons const int p = sector; theta_base = pos[channel_x] * sycl::pow(theta_scale, (float) p); } else { - // Simplified from CUDA backend code: if (sector >= sections.v[0] && sector < sec_w) which is just sector >= sections.v[0] const int p = sector - sections.v[0]; theta_base = pos[channel_x + ne2] * sycl::pow(theta_scale, (float) p); } diff --git a/src/ggml-sycl/wkv.cpp b/src/ggml-sycl/wkv.cpp index c10e2f76..b56e0c24 100644 --- a/src/ggml-sycl/wkv.cpp +++ b/src/ggml-sycl/wkv.cpp @@ -1,7 +1,7 @@ #include #include "wkv.hpp" -constexpr int WKV_BLOCK_SIZE = 64; // Matching CUDA_WKV_BLOCK_SIZE +constexpr int WKV_BLOCK_SIZE = 64; // Helper function for the main kernel template