metal : enable ggml-alloc (#2627)

author Shouzheng Liu <redacted>

Wed, 16 Aug 2023 20:08:28 +0000 (16:08 -0400)

committer GitHub <redacted>

Wed, 16 Aug 2023 20:08:28 +0000 (23:08 +0300)
author Shouzheng Liu <redacted>
Wed, 16 Aug 2023 20:08:28 +0000 (16:08 -0400)
committer GitHub <redacted>
Wed, 16 Aug 2023 20:08:28 +0000 (23:08 +0300)
diff --git a/ggml-alloc.c b/ggml-alloc.c

index 4121f3dbab20ff82f7b58bd4a18e71bb4435c3ba..8de28cf9deb05a01995f2420cd1681d5b9bc3c54 100644 (file)
--- a/ggml-alloc.c
+++ b/ggml-alloc.c
@@ -67,6 +67,8 @@ struct ggml_allocr {
      struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE];
      size_t max_size;
      bool measure;
+    int parse_seq[GGML_MAX_NODES];
+    bool has_parse_seq;
  
  #ifdef GGML_ALLOCATOR_DEBUG
      struct ggml_tensor * allocated_tensors[1024];
@@ -229,6 +231,17 @@ static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_t
      alloc->n_free_blocks++;
  }
  
+void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n) { // store the node visiting order from list[0..n), skipping -1 entries
+    int pos = 0; // number of entries written to alloc->parse_seq so far
+    for (int i = 0; i < n && pos < GGML_MAX_NODES; i++) { // stop at capacity: parse_seq holds GGML_MAX_NODES entries
+        if (list[i] != -1) { // -1 entries are not node indices and are dropped
+            alloc->parse_seq[pos] = list[i];
+            pos++;
+        }
+    }
+    alloc->has_parse_seq = pos > 0; // an empty list must not enable the sequence: parse_seq would be all zeros and every step would visit node 0
+}
+
  void ggml_allocr_reset(struct ggml_allocr * alloc) {
      alloc->n_free_blocks = 1;
      size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
@@ -248,6 +261,8 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
          /*.hash_table    = */ {{0}},
          /*.max_size      = */ 0,
          /*.measure       = */ false,
+        /*.parse_seq     = */ {0},
+        /*.has_parse_seq = */ false,
  #ifdef GGML_ALLOCATOR_DEBUG
          /*.allocated_tensors = */ = {0},
  #endif
@@ -275,6 +290,8 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
          /*.hash_table    = */ {{0}},
          /*.max_size      = */ 0,
          /*.measure       = */ true,
+        /*.parse_seq     = */ {0},
+        /*.has_parse_seq = */ false,
  #ifdef GGML_ALLOCATOR_DEBUG
          /*.allocated_tensors = */ = {0},
  #endif
@@ -473,7 +490,13 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
                  allocate_node(alloc, input);
              }
          }
-        for (int i = 0; i < gf->n_nodes; i++) {
+        for (int ind = 0; ind < gf->n_nodes; ind++) {
+            int i;
+            if (alloc->has_parse_seq) {
+                i = alloc->parse_seq[ind];
+            } else {
+                i = ind;
+            }
              struct ggml_tensor * node = gf->nodes[i];
  
              // allocate parents (leafs)
diff --git a/ggml-alloc.h b/ggml-alloc.h

index a5ec8f87a94530840ef7c030a6db9c3e9cdef6e0..14a4350ac2e968f5455632f2b7ad5dba5f4a2d8e 100644 (file)
--- a/ggml-alloc.h
+++ b/ggml-alloc.h
@@ -10,6 +10,10 @@ extern "C" {
  GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);
  GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
  
+// tell the allocator to parse nodes following the order described in the list
+// you should call this if your graph is optimized to execute out-of-order
+GGML_API void   ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n);
+
  GGML_API void   ggml_allocr_free(struct ggml_allocr * alloc);
  GGML_API bool   ggml_allocr_is_measure(struct ggml_allocr * alloc);
  GGML_API void   ggml_allocr_reset(struct ggml_allocr * alloc);
diff --git a/ggml-metal.h b/ggml-metal.h

index 16f1a0caacfac483cf2c33e0715ca8d1c61ea7ce..bf3f9a6a8f4a3764ed0e8a8a8926710a7a2608d3 100644 (file)
--- a/ggml-metal.h
+++ b/ggml-metal.h
@@ -63,10 +63,13 @@ void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor *
  
  // try to find operations that can be run concurrently in the graph
  // you should run it again if the topology of your graph changes
-void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
+void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf, bool check_mem);
  
-// if the graph has been optimized for concurrently dispatch
-bool ggml_metal_if_optimized(struct ggml_metal_context * ctx);
+// if the graph has been optimized for concurrent dispatch, returns the length of the concur_list
+int ggml_metal_if_optimized(struct ggml_metal_context * ctx);
+
+// output the concur_list for ggml_alloc
+int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx);
  
  // same as ggml_graph_compute but uses Metal
  // creates gf->n_threads command buffers in parallel
diff --git a/ggml-metal.m b/ggml-metal.m

index e13cb4b3cd9ba136e37a421e4b98ae23b211186a..32c6e486985ffc95829d39dc9fdafd195f67b08b 100644 (file)
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -236,11 +236,12 @@ void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) {
      ctx->n_cb = n_cb;
  }
  
-bool ggml_metal_if_optimized(struct ggml_metal_context * ctx) {
-    if (ctx->concur_list_len) {
-        return true;
-    }
-    return false;
+int ggml_metal_if_optimized(struct ggml_metal_context * ctx) { // now reports the stored list length instead of a bool
+    return ctx->concur_list_len; // the removed version returned true exactly when this value was nonzero
+}
+
+int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx) { // accessor for the context's concur_list
+    return ctx->concur_list; // hands back the context's own list, not a copy
  }
  
  // finds the Metal buffer that contains the tensor data on the GPU device
@@ -383,7 +384,7 @@ void ggml_metal_get_tensor(
  
  void ggml_metal_graph_find_concurrency(
          struct ggml_metal_context * ctx,
-        struct ggml_cgraph * gf) {
+        struct ggml_cgraph * gf, bool check_mem) {
      int search_depth = gf->n_nodes; //we only find concurrency in this range to avoid wasting too much time
      int nodes_unused[GGML_MAX_CONCUR];
  
@@ -430,7 +431,7 @@ void ggml_metal_graph_find_concurrency(
                          }
                      }
                  }
-                if (exe_flag) {
+                if (exe_flag && check_mem) {
                      // check if nodes[i]'s data will be overwritten by a node before nodes[i].
                      // if node[5] and node[3] write to the same memory region, then we can't issue node[5] before node[3]
                      int64_t data_start = (int64_t) gf->nodes[i]->data;
diff --git a/llama.cpp b/llama.cpp

index a161f1566db9ce4f87172c0354a5d354b532e875..3452439904bc0aa93655b5e4986d9b4a2fd532d1 100644 (file)
--- a/llama.cpp
+++ b/llama.cpp
@@ -63,7 +63,7 @@ static void llama_log_callback_default(llama_log_level level, const char * text,
  #define LLAMA_LOG_ERROR(...) llama_log_internal(LLAMA_LOG_LEVEL_ERROR, __VA_ARGS__)
  
  
-#if !defined(GGML_USE_CUBLAS) && !defined(GGML_USE_METAL)
+#if !defined(GGML_USE_CUBLAS)
  #include "ggml-alloc.h"
  #define LLAMA_USE_ALLOCATOR
  #else
@@ -1846,10 +1846,6 @@ static bool llama_eval_internal(
  
  #ifdef GGML_USE_METAL
      if (lctx.ctx_metal) {
-        // TODO: disabled until #2413 is resolved
-        //if (!ggml_metal_if_optimized(lctx.ctx_metal)) {
-        //    ggml_metal_graph_find_concurrency(lctx.ctx_metal, gf);
-        //}
          ggml_metal_set_n_cb     (lctx.ctx_metal, n_threads);
          ggml_metal_graph_compute(lctx.ctx_metal, gf);
          ggml_metal_get_tensor   (lctx.ctx_metal, res);
@@ -3287,7 +3283,18 @@ struct llama_context * llama_new_context_with_model(
              int n_past = hparams.n_ctx - n_tokens;
              llama_token token = llama_token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
              ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past);
-
+#ifdef GGML_USE_METAL
+            if (params.n_gpu_layers > 0) {
+                ctx->ctx_metal = ggml_metal_init(1);
+                if (!ctx->ctx_metal) {
+                    LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
+                    llama_free(ctx);
+                    return NULL;
+                }
+                ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false);
+                ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
+            }
+#endif
              // measure memory requirements for the graph
              size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
  
@@ -3305,6 +3312,11 @@ struct llama_context * llama_new_context_with_model(
  
              ctx->buf_alloc.resize(alloc_size);
              ctx->alloc = ggml_allocr_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, tensor_alignment);
+#ifdef GGML_USE_METAL
+            if (ctx->ctx_metal) {
+                ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
+            }
+#endif
          }
  #else
          ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());
@@ -3319,13 +3331,6 @@ struct llama_context * llama_new_context_with_model(
  #ifdef GGML_USE_METAL
      if (params.n_gpu_layers > 0) {
          // this allocates all Metal resources and memory buffers
-        ctx->ctx_metal = ggml_metal_init(1);
-
-        if (!ctx->ctx_metal) {
-            LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
-            llama_free(ctx);
-            return NULL;
-        }
  
          void * data_ptr  = NULL;
          size_t data_size = 0;
@@ -3354,8 +3359,7 @@ struct llama_context * llama_new_context_with_model(
          LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0));
          LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv",   ctx->kv_self.buf.addr, ctx->kv_self.buf.size, 0));
  
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0));
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.addr, ctx->buf_alloc.size, 0));
  #undef LLAMA_METAL_CHECK_BUF
      }
  #endif
author	Shouzheng Liu <redacted>
	Wed, 16 Aug 2023 20:08:28 +0000 (16:08 -0400)
committer	GitHub <redacted>
	Wed, 16 Aug 2023 20:08:28 +0000 (23:08 +0300)
ggml-alloc.c		patch \| blob \| history
ggml-alloc.h		patch \| blob \| history
ggml-metal.h		patch \| blob \| history
ggml-metal.m		patch \| blob \| history
llama.cpp		patch \| blob \| history