metal : adaptive CPU/GPU interleave based on number of nodes (llama/19369)

author Georgi Gerganov <redacted>
Thu, 5 Feb 2026 17:07:22 +0000 (19:07 +0200)
committer Georgi Gerganov <redacted>
Sat, 7 Feb 2026 08:37:38 +0000 (10:37 +0200)
diff --git a/src/ggml-metal/ggml-metal-context.m b/src/ggml-metal/ggml-metal-context.m

index a412d70aed5fbbc7d6280606178e056484f7780e..c7e8ebd3f32e70600b7c5989aa62985c9dfa3957 100644 (file)
--- a/src/ggml-metal/ggml-metal-context.m
+++ b/src/ggml-metal/ggml-metal-context.m
@@ -415,7 +415,7 @@ bool ggml_metal_cpy_tensor_async(ggml_metal_t ctx_src, ggml_metal_t ctx_dst, con
  
  enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph * gf) {
      // number of nodes encoded by the main thread (empirically determined)
-    const int n_main = 64;
+    const int n_main = MAX(64, 0.1*gf->n_nodes);
  
      // number of threads in addition to the main thread
      const int n_cb = ctx->n_cb;
author	Georgi Gerganov <redacted>
	Thu, 5 Feb 2026 17:07:22 +0000 (19:07 +0200)
committer	Georgi Gerganov <redacted>
	Sat, 7 Feb 2026 08:37:38 +0000 (10:37 +0200)