* ggml : add GGML_SCHED_NO_REALLOC option to disable reallocations in ggml_backend_sched
Enabled in ggml-ci for testing; see the configure sketch below.
* llama : update worst-case graph for unified cache
* ci : disable op offload in some tests
* fix spelling
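
For local debugging, the option can be enabled at configure time the same way ggml-ci does below (a minimal sketch, assuming a standard CMake build of llama.cpp):

    # build with graph reallocations disabled in ggml_backend_sched
    cmake -B build -DGGML_SCHED_NO_REALLOC=ON
    cmake --build build -j

With the option set, ggml_backend_sched aborts instead of reserving a larger compute buffer when graph allocation fails, which makes worst-case graph underestimates visible in CI.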
---------
Co-authored-by: Georgi Gerganov <redacted>
cd $sd/../
SRC=`pwd`
-CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON"
+CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DGGML_SCHED_NO_REALLOC=ON"
if [ ! -z ${GG_BUILD_METAL} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
- (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
- (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
- (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
- (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+ (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa off --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+ (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa on --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+ (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+ (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
function check_ppl {
qnt="$1"
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
- (time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
- (time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+ (time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" -ngl 99 -c 0 --no-op-offload) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+ (time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" -ngl 99 -c 0 --no-op-offload) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
set +e
}
model_f16="${path_models}/ggml-model-f16.gguf"
# for this model, the SEP token is "</s>"
- (time ./bin/llama-embedding --model ${model_f16} -p "what is panda?\thi\nwhat is panda?\tit's a bear\nwhat is panda?\tThe giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
+ (time ./bin/llama-embedding --model ${model_f16} -p "what is panda?\thi\nwhat is panda?\tit's a bear\nwhat is panda?\tThe giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --no-op-offload --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
# sample output
# rerank score 0: 0.029
params.embedding = true;
+ // get max number of sequences per batch
+ const int n_seq_max = llama_max_parallel_sequences();
+
// if the number of prompts that would be encoded is known in advance, it's more efficient to specify the
// --parallel argument accordingly. for convenience, if not specified, we fall back to the unified KV cache
// in order to support any number of prompts
if (params.n_parallel == 1) {
LOG_INF("%s: n_parallel == 1 -> unified KV cache is enabled\n", __func__);
params.kv_unified = true;
+ params.n_parallel = n_seq_max;
}
// utilize the full context
params.n_ubatch = params.n_batch;
}
- // get max number of sequences per batch
- const int n_seq_max = llama_max_parallel_sequences();
-
llama_backend_init();
llama_numa_init(params.numa);
# ggml core
set(GGML_SCHED_MAX_COPIES "4" CACHE STRING "ggml: max input copies for pipeline parallelism")
option(GGML_CPU "ggml: enable CPU backend" ON)
+option(GGML_SCHED_NO_REALLOC "ggml: disallow reallocations in ggml-alloc (for debugging)" OFF)
# 3rd party libs / backends
option(GGML_ACCELERATE "ggml: enable Accelerate framework" ON)
target_compile_definitions(ggml-base PUBLIC GGML_BACKEND_DL)
endif()
+if (GGML_SCHED_NO_REALLOC)
+ target_compile_definitions(ggml-base PUBLIC GGML_SCHED_NO_REALLOC)
+endif()
+
add_library(ggml
ggml-backend-reg.cpp)
add_library(ggml::ggml ALIAS ggml)
}
if (realloc) {
#ifndef NDEBUG
- size_t cur_size = galloc->buffers[i] ? ggml_vbuffer_size(galloc->buffers[i]) : 0;
- GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
+ {
+ size_t cur_size = galloc->buffers[i] ? ggml_vbuffer_size(galloc->buffers[i]) : 0;
+ if (cur_size > 0) {
+ GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n",
+ __func__, ggml_backend_buft_name(galloc->bufts[i]),
+ cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
+ }
+ }
#endif
-
ggml_vbuffer_free(galloc->buffers[i]);
galloc->buffers[i] = ggml_vbuffer_alloc(galloc->bufts[i], galloc->buf_tallocs[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
if (galloc->buffers[i] == NULL) {
// allocate graph
if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
+#ifdef GGML_SCHED_NO_REALLOC
+ GGML_ABORT("%s: failed to allocate graph, but graph re-allocation is disabled by GGML_SCHED_NO_REALLOC\n", __func__);
+#endif
+
+#ifndef NDEBUG
+ GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
+#endif
+
// the re-allocation may cause the split inputs to be moved to a different address
// synchronize without ggml_backend_sched_synchronize to avoid changing cur_copy
for (int i = 0; i < sched->n_backends; i++) {
ggml_backend_synchronize(sched->backends[i]);
}
-#ifndef NDEBUG
- GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
-#endif
+
ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
if (!ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
GGML_LOG_ERROR("%s: failed to allocate graph\n", __func__);
cross.v_embd.clear();
- const uint32_t n_seqs = cparams.kv_unified ? 1 : cparams.n_seq_max;
+ const uint32_t n_seqs = cparams.n_seq_max;
const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
// avoid reserving graphs with zero outputs - assume one output per sequence
throw std::runtime_error("failed to initialize memory context");
}
- const uint32_t n_seqs = cparams.kv_unified ? 1 : cparams.n_seq_max;
+ const uint32_t n_seqs = cparams.n_seq_max;
const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
llama_build_and_test(test-arg-parser.cpp)
endif()
-if (NOT LLAMA_SANITIZE_ADDRESS)
+if (NOT LLAMA_SANITIZE_ADDRESS AND NOT GGML_SCHED_NO_REALLOC)
# TODO: repair known memory leaks
llama_build_and_test(test-opt.cpp)
endif()