metal : fix memory leak (#2762)

author Georgi Gerganov <redacted>

Mon, 28 Aug 2023 07:59:08 +0000 (10:59 +0300)

committer GitHub <redacted>

Mon, 28 Aug 2023 07:59:08 +0000 (10:59 +0300)
author Georgi Gerganov <redacted>
Mon, 28 Aug 2023 07:59:08 +0000 (10:59 +0300)
committer GitHub <redacted>
Mon, 28 Aug 2023 07:59:08 +0000 (10:59 +0300)
diff --git a/ggml-metal.h b/ggml-metal.h

index 00202b787c8043173b8308f49e6ae61080d42094..fca28d37ef97069ca7800fc890ea4d013c46437f 100644 (file)
--- a/ggml-metal.h
+++ b/ggml-metal.h
@@ -24,6 +24,7 @@
  
  // max memory buffers that can be mapped to the device
  #define GGML_METAL_MAX_BUFFERS 16
+#define GGML_METAL_MAX_COMMAND_BUFFERS 32
  
  struct ggml_tensor;
  struct ggml_cgraph;
diff --git a/ggml-metal.m b/ggml-metal.m

index 06eb3872e25e400efa07209bbac4253ea7a879ce..ad2ee8cf5fef098169d765b7fa92b12f57c9f70c 100644 (file)
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -33,12 +33,15 @@ struct ggml_metal_buffer {
  struct ggml_metal_context {
      int n_cb;
  
-    float * logits;
-
      id<MTLDevice>       device;
      id<MTLCommandQueue> queue;
      id<MTLLibrary>      library;
  
+    id<MTLCommandBuffer>         command_buffers [GGML_METAL_MAX_COMMAND_BUFFERS];
+    id<MTLComputeCommandEncoder> command_encoders[GGML_METAL_MAX_COMMAND_BUFFERS];
+
+    dispatch_queue_t d_queue;
+
      int n_buffers;
      struct ggml_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
  
@@ -114,12 +117,13 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
  
      struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
  
-    ctx->n_cb   = n_cb;
+    ctx->n_cb   = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
      ctx->device = MTLCreateSystemDefaultDevice();
      ctx->queue  = [ctx->device newCommandQueue];
      ctx->n_buffers = 0;
      ctx->concur_list_len = 0;
  
+    ctx->d_queue = dispatch_queue_create("llama.cpp", DISPATCH_QUEUE_CONCURRENT);
  
  #if 0
      // compile from source string and show compile log
@@ -239,9 +243,67 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
  
  void ggml_metal_free(struct ggml_metal_context * ctx) {
      fprintf(stderr, "%s: deallocating\n", __func__);
+#define GGML_METAL_DEL_KERNEL(name) \
+    [ctx->function_##name release]; \
+    [ctx->pipeline_##name release];
+
+    GGML_METAL_DEL_KERNEL(add); // each invocation releases ctx->function_<name> and ctx->pipeline_<name>
+    GGML_METAL_DEL_KERNEL(add_row);
+    GGML_METAL_DEL_KERNEL(mul);
+    GGML_METAL_DEL_KERNEL(mul_row);
+    GGML_METAL_DEL_KERNEL(scale);
+    GGML_METAL_DEL_KERNEL(silu);
+    GGML_METAL_DEL_KERNEL(relu);
+    GGML_METAL_DEL_KERNEL(gelu);
+    GGML_METAL_DEL_KERNEL(soft_max);
+    GGML_METAL_DEL_KERNEL(diag_mask_inf);
+    GGML_METAL_DEL_KERNEL(get_rows_f16);
+    GGML_METAL_DEL_KERNEL(get_rows_q4_0);
+    GGML_METAL_DEL_KERNEL(get_rows_q4_1);
+    GGML_METAL_DEL_KERNEL(get_rows_q8_0);
+    GGML_METAL_DEL_KERNEL(get_rows_q2_K);
+    GGML_METAL_DEL_KERNEL(get_rows_q3_K);
+    GGML_METAL_DEL_KERNEL(get_rows_q4_K);
+    GGML_METAL_DEL_KERNEL(get_rows_q5_K);
+    GGML_METAL_DEL_KERNEL(get_rows_q6_K);
+    GGML_METAL_DEL_KERNEL(rms_norm);
+    GGML_METAL_DEL_KERNEL(norm);
+    GGML_METAL_DEL_KERNEL(mul_mat_f16_f32);
+    GGML_METAL_DEL_KERNEL(mul_mat_q4_0_f32);
+    GGML_METAL_DEL_KERNEL(mul_mat_q4_1_f32);
+    GGML_METAL_DEL_KERNEL(mul_mat_q8_0_f32);
+    GGML_METAL_DEL_KERNEL(mul_mat_q2_K_f32);
+    GGML_METAL_DEL_KERNEL(mul_mat_q3_K_f32);
+    GGML_METAL_DEL_KERNEL(mul_mat_q4_K_f32);
+    GGML_METAL_DEL_KERNEL(mul_mat_q5_K_f32);
+    GGML_METAL_DEL_KERNEL(mul_mat_q6_K_f32);
+    GGML_METAL_DEL_KERNEL(mul_mm_f16_f32);
+    GGML_METAL_DEL_KERNEL(mul_mm_q4_0_f32);
+    GGML_METAL_DEL_KERNEL(mul_mm_q8_0_f32);
+    GGML_METAL_DEL_KERNEL(mul_mm_q4_1_f32);
+    GGML_METAL_DEL_KERNEL(mul_mm_q2_K_f32);
+    GGML_METAL_DEL_KERNEL(mul_mm_q3_K_f32);
+    GGML_METAL_DEL_KERNEL(mul_mm_q4_K_f32);
+    GGML_METAL_DEL_KERNEL(mul_mm_q5_K_f32);
+    GGML_METAL_DEL_KERNEL(mul_mm_q6_K_f32);
+    GGML_METAL_DEL_KERNEL(rope);
+    GGML_METAL_DEL_KERNEL(alibi_f32);
+    GGML_METAL_DEL_KERNEL(cpy_f32_f16);
+    GGML_METAL_DEL_KERNEL(cpy_f32_f32);
+    GGML_METAL_DEL_KERNEL(cpy_f16_f16);
+
+#undef GGML_METAL_DEL_KERNEL // helper macro is only meaningful inside this function
+
      for (int i = 0; i < ctx->n_buffers; ++i) {
          [ctx->buffers[i].metal release];
      }
+
+    [ctx->library release]; // release the library, command queue and device objects stored in ctx
+    [ctx->queue release];
+    [ctx->device release];
+
+    dispatch_release(ctx->d_queue); // balances dispatch_queue_create() in ggml_metal_init
+
      free(ctx);
  }
  
@@ -261,7 +323,7 @@ void ggml_metal_host_free(void * data) {
  }
  
  void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) {
-    ctx->n_cb = n_cb;
+    ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
  }
  
  int ggml_metal_if_optimized(struct ggml_metal_context * ctx) {
@@ -507,6 +569,8 @@ void ggml_metal_graph_compute(
                 struct ggml_cgraph * gf) {
      metal_printf("%s: evaluating graph\n", __func__);
  
+    @autoreleasepool {
+
      // if there is ctx->concur_list, dispatch concurrently
      // else fallback to serial dispatch
      MTLComputePassDescriptor * edesc = MTLComputePassDescriptor.computePassDescriptor;
@@ -521,29 +585,25 @@ void ggml_metal_graph_compute(
  
      const int n_cb = ctx->n_cb;
  
-    NSMutableArray * command_buffers = [NSMutableArray arrayWithCapacity:n_cb];
-
      for (int i = 0; i < n_cb; ++i) {
-        command_buffers[i] = [ctx->queue commandBuffer];
+        ctx->command_buffers[i] = [ctx->queue commandBuffer];
  
          // enqueue the command buffers in order to specify their execution order
-        [command_buffers[i] enqueue];
-    }
+        [ctx->command_buffers[i] enqueue];
  
-    // TODO: is this the best way to start threads?
-    dispatch_queue_t queue = dispatch_queue_create("llama.cpp", DISPATCH_QUEUE_CONCURRENT);
+        ctx->command_encoders[i] = [ctx->command_buffers[i] computeCommandEncoderWithDescriptor: edesc];
+    }
  
      for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
          const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb;
  
-        dispatch_async(queue, ^{
+        dispatch_async(ctx->d_queue, ^{
              size_t offs_src0 = 0;
              size_t offs_src1 = 0;
              size_t offs_dst  = 0;
  
-            id<MTLCommandBuffer> command_buffer = command_buffers[cb_idx];
-
-            id<MTLComputeCommandEncoder> encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
+            id<MTLCommandBuffer> command_buffer  = ctx->command_buffers[cb_idx];
+            id<MTLComputeCommandEncoder> encoder = ctx->command_encoders[cb_idx];
  
              const int node_start =                                      (cb_idx + 0) * n_nodes_per_cb;
              const int node_end   = MIN((cb_idx == n_cb - 1) ? n_nodes : (cb_idx + 1) * n_nodes_per_cb, n_nodes);
@@ -1117,17 +1177,19 @@ void ggml_metal_graph_compute(
      }
  
      // wait for all threads to finish
-    dispatch_barrier_sync(queue, ^{});
-
-    [command_buffers[n_cb - 1] waitUntilCompleted];
+    dispatch_barrier_sync(ctx->d_queue, ^{});
  
      // check status of command buffers
      // needed to detect if the device ran out-of-memory for example (#1881)
      for (int i = 0; i < n_cb; i++) {
-        MTLCommandBufferStatus status = (MTLCommandBufferStatus) [command_buffers[i] status];
+        [ctx->command_buffers[i] waitUntilCompleted];
+
+        MTLCommandBufferStatus status = (MTLCommandBufferStatus) [ctx->command_buffers[i] status];
          if (status != MTLCommandBufferStatusCompleted) {
              fprintf(stderr, "%s: command buffer %d failed with status %lu\n", __func__, i, status);
              GGML_ASSERT(false);
          }
      }
+
+    }
  }
diff --git a/ggml.c b/ggml.c

index 767c19ae2b58ad10643f80a07d61aa396ea67c46..54f426bc066b6b10898ecfe33874e50f8e94f8e0 100644 (file)
--- a/ggml.c
+++ b/ggml.c
@@ -2436,7 +2436,6 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
      const int nb = n / qk;
  
      assert(n % qk == 0);
-    assert(nb % 2 == 0);
  
      const block_q4_0 * restrict x = vx;
      const block_q8_0 * restrict y = vy;
@@ -2445,6 +2444,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
      float32x4_t sumv0 = vdupq_n_f32(0.0f);
      float32x4_t sumv1 = vdupq_n_f32(0.0f);
  
+    GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
      for (int i = 0; i < nb; i += 2) {
          const block_q4_0 * restrict x0 = &x[i + 0];
          const block_q4_0 * restrict x1 = &x[i + 1];
@@ -2623,6 +2623,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
      }
  
      // Main loop
+    GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
      for (int i = 2; i < nb; i+=2) {
          _mm_prefetch(&x[i] + sizeof(block_q4_0), _MM_HINT_T0);
          _mm_prefetch(&y[i] + sizeof(block_q8_0), _MM_HINT_T0);
@@ -2706,7 +2707,6 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
      const int nb = n / qk;
  
      assert(n % qk == 0);
-    assert(nb % 2 == 0);
  
      const block_q4_1 * restrict x = vx;
      const block_q8_1 * restrict y = vy;
@@ -2718,6 +2718,7 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
  
      float summs = 0;
  
+    GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
      for (int i = 0; i < nb; i += 2) {
          const block_q4_1 * restrict x0 = &x[i + 0];
          const block_q4_1 * restrict x1 = &x[i + 1];
@@ -2832,7 +2833,6 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
      const int nb = n / qk;
  
      assert(n % qk == 0);
-    assert(nb % 2 == 0);
      assert(qk == QK5_0);
  
      const block_q5_0 * restrict x = vx;
@@ -2848,6 +2848,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
      uint64_t tmp0[4];
      uint64_t tmp1[4];
  
+    GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
      for (int i = 0; i < nb; i += 2) {
          const block_q5_0 * restrict x0 = &x[i];
          const block_q5_0 * restrict x1 = &x[i + 1];
@@ -3072,7 +3073,6 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
      const int nb = n / qk;
  
      assert(n % qk == 0);
-    assert(nb % 2 == 0);
      assert(qk == QK5_1);
  
      const block_q5_1 * restrict x = vx;
@@ -3091,6 +3091,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
      uint64_t tmp0[4];
      uint64_t tmp1[4];
  
+    GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
      for (int i = 0; i < nb; i += 2) {
          const block_q5_1 * restrict x0 = &x[i];
          const block_q5_1 * restrict x1 = &x[i + 1];
@@ -3328,7 +3329,6 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
      const int nb = n / qk;
  
      assert(n % qk == 0);
-    assert(nb % 2 == 0);
  
      const block_q8_0 * restrict x = vx;
      const block_q8_0 * restrict y = vy;
@@ -3337,6 +3337,7 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
      float32x4_t sumv0 = vdupq_n_f32(0.0f);
      float32x4_t sumv1 = vdupq_n_f32(0.0f);
  
+    GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb
      for (int i = 0; i < nb; i += 2) {
          const block_q8_0 * restrict x0 = &x[i + 0];
          const block_q8_0 * restrict x1 = &x[i + 1];
author	Georgi Gerganov <redacted>
	Mon, 28 Aug 2023 07:59:08 +0000 (10:59 +0300)
committer	GitHub <redacted>
	Mon, 28 Aug 2023 07:59:08 +0000 (10:59 +0300)
ggml-metal.h		patch \| blob \| history
ggml-metal.m		patch \| blob \| history
ggml.c		patch \| blob \| history