Fix Metal backend broken from the allocator changes (#2455)

author slaren <redacted>

Mon, 31 Jul 2023 09:02:53 +0000 (11:02 +0200)

committer GitHub <redacted>

Mon, 31 Jul 2023 09:02:53 +0000 (11:02 +0200)
author slaren <redacted>
Mon, 31 Jul 2023 09:02:53 +0000 (11:02 +0200)
committer GitHub <redacted>
Mon, 31 Jul 2023 09:02:53 +0000 (11:02 +0200)
diff --git a/llama.cpp b/llama.cpp

index 6f381f30f8fe1a33df0a8b4c90008143d1a991cd..50da4274fa2984b7251c21412e224b47b65533dd 100644 (file)
--- a/llama.cpp
+++ b/llama.cpp
@@ -1812,6 +1812,12 @@ static bool llama_eval_internal(
      // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
      n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
  
+    struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
+    struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
+
+    LLAMA_ASSERT(strcmp(res->name, "result_output") == 0);
+    LLAMA_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
+
  #if GGML_USE_MPI
      const int64_t n_layer = hparams.n_layer;
      ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
@@ -1825,7 +1831,10 @@ static bool llama_eval_internal(
          //}
          ggml_metal_set_n_cb     (lctx.ctx_metal, n_threads);
          ggml_metal_graph_compute(lctx.ctx_metal, gf);
-        ggml_metal_get_tensor   (lctx.ctx_metal, cur);
+        ggml_metal_get_tensor   (lctx.ctx_metal, res);
+        if (!lctx.embedding.empty()) {
+            ggml_metal_get_tensor(lctx.ctx_metal, embeddings);
+        }
      } else {
          // IMPORTANT:
          // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
@@ -1856,12 +1865,6 @@ static bool llama_eval_internal(
      // update kv token count
      lctx.kv_self.n = n_past + N;
  
-    struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
-    struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
-
-    LLAMA_ASSERT(strcmp(res->name, "result_output") == 0);
-    LLAMA_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
-
      if (cgraph_fname) {
          ggml_graph_export(gf, cgraph_fname);
      }
author	slaren <redacted>
	Mon, 31 Jul 2023 09:02:53 +0000 (11:02 +0200)
committer	GitHub <redacted>
	Mon, 31 Jul 2023 09:02:53 +0000 (11:02 +0200)