add_compile_definitions(GGML_SYCL_F16)
endif()
-if (GGML_SYCL_TARGET STREQUAL "NVIDIA")
+if (GGML_SYCL_TARGET STREQUAL "INTEL")
+ add_compile_definitions(GGML_SYCL_WARP_SIZE=16)
+ target_link_options(ggml-sycl PRIVATE -Xs -ze-intel-greater-than-4GB-buffer-required)
+elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA")
add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
elseif (GGML_SYCL_TARGET STREQUAL "AMD")
# INFO: Allowed Sub_group_sizes are not consistent through all
# Target archs tested working: gfx1030, gfx1031, (Only tested sub_group_size = 32)
add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
else()
- add_compile_definitions(GGML_SYCL_WARP_SIZE=16)
+ # default for other target
+ add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
endif()
if (GGML_SYCL_GRAPH)
const int64_t ne = ggml_nelements(src0);
GGML_ASSERT(ne == ggml_nelements(src1));
- GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
- GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
-
GGML_TENSOR_BINARY_OP_LOCALS01;
SYCL_CHECK(ggml_sycl_set_device(ctx.device));