sycl : support to malloc memory on device more than 4GB, update the doc and script...

author Neo Zhang <redacted>

Sat, 29 Nov 2025 12:59:44 +0000 (20:59 +0800)

committer GitHub <redacted>

Sat, 29 Nov 2025 12:59:44 +0000 (14:59 +0200)
author Neo Zhang <redacted>
Sat, 29 Nov 2025 12:59:44 +0000 (20:59 +0800)
committer GitHub <redacted>
Sat, 29 Nov 2025 12:59:44 +0000 (14:59 +0200)
diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md

index 92ab27066b4a5ffe804113acc66d95a7f9ea9625..02a72a9d51546b7f38b8784febdeec15e3319140 100644 (file)
--- a/docs/backend/SYCL.md
+++ b/docs/backend/SYCL.md
@@ -42,6 +42,9 @@ The following releases are verified and recommended:
  
  ## News
  
+- 2025.11
+  - Support device memory allocations larger than 4GB.
+
  - 2025.2
    - Optimize MUL_MAT Q4_0 on Intel GPU for all dGPUs and built-in GPUs since MTL. Increase the performance of LLM (llama-2-7b.Q4_0.gguf) 21%-87% on Intel GPUs (MTL, ARL-H, Arc, Flex, PVC).
      |GPU|Base tokens/s|Increased tokens/s|Percent|
@@ -789,6 +792,8 @@ use 1 SYCL GPUs: [0] with Max compute units:512
  | GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through SYCL Graphs feature. Disabled by default because graph performance isn't yet better than non-graph performance. |
  | GGML_SYCL_DISABLE_DNN | 0 (default) or 1 | Disable running computations through oneDNN and always use oneMKL. |
  | ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |
+| UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS | 0 (default) or 1 | Support device memory allocations larger than 4GB. |
+
  
  
  ## Known Issues
@@ -835,6 +840,14 @@ use 1 SYCL GPUs: [0] with Max compute units:512
    | The default context is too big. It leads to excessive memory usage.|Set `-c 8192` or a smaller value.|
    | The model is too big and requires more memory than what is available.|Choose a smaller model or change to a smaller quantization, like Q5 -> Q4;<br>Alternatively, use more than one device to load model.|
  
+- `ggml_backend_sycl_buffer_type_alloc_buffer: can't allocate 5000000000 Bytes of memory on device`
+
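+  You need to enable support for device memory allocations larger than 4GB by setting the following environment variable (use `export` on Linux or `set` on Windows):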
+  ```
+    export UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
+    set UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
+  ```
+
  ### **GitHub contribution**:
  Please add the `SYCL :` prefix/tag in issues/PRs titles to help the SYCL contributors to check/address them without delay.
  
diff --git a/examples/sycl/run-llama2.sh b/examples/sycl/run-llama2.sh

index 37195008de70fd3cc6495bc6971dad09f847c909..a018e4519724c0dd9991f189b7bbe9a7df6f240e 100755 (executable)
--- a/examples/sycl/run-llama2.sh
+++ b/examples/sycl/run-llama2.sh
@@ -15,6 +15,9 @@ MODEL_FILE=models/llama-2-7b.Q4_0.gguf
  NGL=99
  CONTEXT=4096
  
+# Support device memory allocations larger than 4GB.
+export UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
+
  if [ $# -gt 0 ]; then
      GGML_SYCL_DEVICE=$1
      echo "use $GGML_SYCL_DEVICE as main GPU"
diff --git a/examples/sycl/run-llama3.sh b/examples/sycl/run-llama3.sh

index 8e21b017f4ca55c56da1811966246c42804cab95..47702557031cd7a0c1374a72b036892544f428ec 100755 (executable)
--- a/examples/sycl/run-llama3.sh
+++ b/examples/sycl/run-llama3.sh
@@ -6,7 +6,7 @@
  
  # If you want more control, DPC++ Allows selecting a specific device through the
  # following environment variable
-#export ONEAPI_DEVICE_SELECTOR="level_zero:0"
+export ONEAPI_DEVICE_SELECTOR="level_zero:0"
  source /opt/intel/oneapi/setvars.sh
  
  #export GGML_SYCL_DEBUG=1
@@ -18,11 +18,14 @@ MODEL_FILE=models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf
  NGL=99 # Layers offloaded to the GPU. If the device runs out of memory, reduce this value according to the model you are using.
  CONTEXT=4096
  
+# Support device memory allocations larger than 4GB.
+export UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
+
  if [ $# -gt 0 ]; then
      GGML_SYCL_DEVICE=$1
      echo "Using $GGML_SYCL_DEVICE as the main GPU"
-    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none
+    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none
  else
      #use multiple GPUs with same max compute units
-    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -c ${CONTEXT}
+    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT}
  fi
diff --git a/examples/sycl/win-run-llama2.bat b/examples/sycl/win-run-llama2.bat

index d7564f4161ca27db04d5f80d74ffd5ab7a58acf8..b654f88f62c8d0f38ff49b6f6a399c37b653abe2 100644 (file)
--- a/examples/sycl/win-run-llama2.bat
+++ b/examples/sycl/win-run-llama2.bat
@@ -5,5 +5,7 @@
  set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
  @call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
  
+:: Support device memory allocations larger than 4GB.
+set UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
  
  .\build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p %INPUT2% -n 400 -e -ngl 99 -s 0
diff --git a/examples/sycl/win-run-llama3.bat b/examples/sycl/win-run-llama3.bat

index 4b61aebee5588d52b1f62952d7a788cf3cef1e54..608b834f60e47c9c362a147cacabe145b07263e1 100644 (file)
--- a/examples/sycl/win-run-llama3.bat
+++ b/examples/sycl/win-run-llama3.bat
@@ -5,5 +5,7 @@
  set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
  @call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
  
+:: Support device memory allocations larger than 4GB.
+set UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
  
-.\build\bin\llama-cli.exe -m models\Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf -p %INPUT2% -n 400 -e -ngl 99
+.\build\bin\llama-cli.exe -m models\Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf -p %INPUT2% -n 400 -s 0 -e -ngl 99
diff --git a/ggml/src/ggml-sycl/CMakeLists.txt b/ggml/src/ggml-sycl/CMakeLists.txt

index efd78b912cc65cb66fccb0a10b956e44eeafd8a7..88f29221bba94e6deefe40d35afe3894086d4c3f 100644 (file)
--- a/ggml/src/ggml-sycl/CMakeLists.txt
+++ b/ggml/src/ggml-sycl/CMakeLists.txt
@@ -91,7 +91,10 @@ if (GGML_SYCL_F16)
      add_compile_definitions(GGML_SYCL_F16)
  endif()
  
-if (GGML_SYCL_TARGET STREQUAL "NVIDIA")
+if (GGML_SYCL_TARGET STREQUAL "INTEL")
+    add_compile_definitions(GGML_SYCL_WARP_SIZE=16)
+    target_link_options(ggml-sycl PRIVATE  -Xs   -ze-intel-greater-than-4GB-buffer-required)
+elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA")
      add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
  elseif (GGML_SYCL_TARGET STREQUAL "AMD")
      # INFO: Allowed Sub_group_sizes are not consistent through all
@@ -100,7 +103,8 @@ elseif (GGML_SYCL_TARGET STREQUAL "AMD")
      # Target archs tested working: gfx1030, gfx1031, (Only tested sub_group_size = 32)
      add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
  else()
-    add_compile_definitions(GGML_SYCL_WARP_SIZE=16)
+    # Default for all other targets.
+    add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
  endif()
  
  if (GGML_SYCL_GRAPH)
diff --git a/ggml/src/ggml-sycl/cpy.cpp b/ggml/src/ggml-sycl/cpy.cpp

index 1ec99b0a5d1335f230e518b8158732074256590d..96709554cf692a9e28306496b1779fb36b31cf1a 100644 (file)
--- a/ggml/src/ggml-sycl/cpy.cpp
+++ b/ggml/src/ggml-sycl/cpy.cpp
@@ -515,9 +515,6 @@ void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, co
      const int64_t ne = ggml_nelements(src0);
      GGML_ASSERT(ne == ggml_nelements(src1));
  
-    GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
-    GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
-
      GGML_TENSOR_BINARY_OP_LOCALS01;
  
      SYCL_CHECK(ggml_sycl_set_device(ctx.device));
author	Neo Zhang <redacted>
	Sat, 29 Nov 2025 12:59:44 +0000 (20:59 +0800)
committer	GitHub <redacted>
	Sat, 29 Nov 2025 12:59:44 +0000 (14:59 +0200)
docs/backend/SYCL.md		patch \| blob \| history
examples/sycl/run-llama2.sh		patch \| blob \| history
examples/sycl/run-llama3.sh		patch \| blob \| history
examples/sycl/win-run-llama2.bat		patch \| blob \| history
examples/sycl/win-run-llama3.bat		patch \| blob \| history
ggml/src/ggml-sycl/CMakeLists.txt		patch \| blob \| history
ggml/src/ggml-sycl/cpy.cpp		patch \| blob \| history