sync : ggml (HBM + Metal + style) (#1264)

author Georgi Gerganov <redacted>

Fri, 8 Sep 2023 14:58:31 +0000 (17:58 +0300)

committer GitHub <redacted>

Fri, 8 Sep 2023 14:58:31 +0000 (17:58 +0300)
author Georgi Gerganov <redacted>
Fri, 8 Sep 2023 14:58:31 +0000 (17:58 +0300)
committer GitHub <redacted>
Fri, 8 Sep 2023 14:58:31 +0000 (17:58 +0300)
diff --git a/ggml-metal.m b/ggml-metal.m

index 521ca180f085be10a3972d3a2e3ea386556a1bdb..7e2355ce6bcc7eeb8abe7f5290cad19e9629cb35 100644 (file)
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -1141,7 +1141,7 @@ void ggml_metal_graph_compute(
                              [encoder setBytes:&freq_base  length:sizeof(float) atIndex:21];
                              [encoder setBytes:&freq_scale length:sizeof(float) atIndex:22];
  
-                            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)];
                          } break;
                      case GGML_OP_DUP:
                      case GGML_OP_CPY:
diff --git a/ggml-metal.metal b/ggml-metal.metal

index 119fcbeb623c11ca25a1c59c08564d7c3da32fba..5070561fba1ace47d5b224d13ec8e9a0a7771773 100644 (file)
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@@ -220,14 +220,10 @@ kernel void kernel_norm(
          }
          threadgroup_barrier(mem_flags::mem_threadgroup);
      }
-    //// broadcast
-    //if (tpitg == 0) {
-    //    sum[0] /= ne00;
-    //}
-    //threadgroup_barrier(mem_flags::mem_threadgroup);
-    const float mean  = sum[0];
+    const float mean  = sum[0] / ne00;
  
      // recenter and VARIANCE
+    threadgroup_barrier(mem_flags::mem_threadgroup);
      device float * y = dst + tgpig*ne00;
      sum[tpitg] = 0.0f;
      for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
@@ -235,12 +231,6 @@ kernel void kernel_norm(
          sum[tpitg] += y[i00] * y[i00];
      }
  
-    //// VARIANCE
-    //// parallel sum
-    //sum[tpitg] = 0.0f;
-    //for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
-    //    sum[tpitg] += y[i00] * y[i00];
-    //}
      // reduce
      threadgroup_barrier(mem_flags::mem_threadgroup);
      for (uint i = ntg/2; i > 0; i /= 2) {
@@ -249,12 +239,7 @@ kernel void kernel_norm(
          }
          threadgroup_barrier(mem_flags::mem_threadgroup);
      }
-    //// broadcast
-    //if (tpitg == 0) {
-    //    sum[0] /= ne00;
-    //}
-    //threadgroup_barrier(mem_flags::mem_threadgroup);
-    const float variance = sum[0];
+    const float variance = sum[0] / ne00;
  
      const float scale = 1.0f/sqrt(variance + eps);
      for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
@@ -262,7 +247,6 @@ kernel void kernel_norm(
      }
  }
  
-
  kernel void kernel_rms_norm(
          device const  void * src0,
          device       float * dst,
@@ -630,7 +614,6 @@ kernel void kernel_mul_mat_f16_f32(
              }
          }
      }
-
  }
  
  kernel void kernel_alibi_f32(
@@ -699,25 +682,27 @@ kernel void kernel_rope(
          constant       int & mode,
          constant     float & freq_base,
          constant     float & freq_scale,
-        uint3 tpig[[thread_position_in_grid]]) {
-    const int64_t i3 = tpig[2];
-    const int64_t i2 = tpig[1];
-    const int64_t i1 = tpig[0];
+        uint  tiitg[[thread_index_in_threadgroup]],
+        uint3 tptg[[threads_per_threadgroup]],
+        uint3 tgpig[[threadgroup_position_in_grid]]) {
+    const int64_t i3 = tgpig[2];
+    const int64_t i2 = tgpig[1];
+    const int64_t i1 = tgpig[0];
  
      const bool is_neox = mode & 2;
-    const float theta_scale = pow(freq_base, -2.0f/n_dims);
  
      const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2);
  
-    float theta = freq_scale * (float)p;
+    const float theta_0 = freq_scale * (float)p;
+    const float inv_ndims = -1.f/n_dims;
  
      if (!is_neox) {
-        for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
+        for (int64_t i0 = 2*tiitg; i0 < ne0; i0 += 2*tptg.x) {
+
+            const float theta = theta_0 * pow(freq_base, inv_ndims*i0);
              const float cos_theta = cos(theta);
              const float sin_theta = sin(theta);
  
-            theta *= theta_scale;
-
              device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
              device       float * dst_data  = (device float *)((device char *)  dst + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
  
@@ -729,12 +714,12 @@ kernel void kernel_rope(
          }
      } else {
          for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
-            for (int64_t ic = 0; ic < n_dims; ic += 2) {
+            for (int64_t ic = 2*tiitg; ic < n_dims; ic += 2*tptg.x) {
+
+                const float theta = theta_0 * pow(freq_base, inv_ndims*ic - ib);
                  const float cos_theta = cos(theta);
                  const float sin_theta = sin(theta);
  
-                theta *= theta_scale;
-
                  const int64_t i0 = ib*n_dims + ic/2;
  
                  device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
diff --git a/ggml.c b/ggml.c

index 0fe07b245ba98b137e5d3aa54cbc3010a1a3a7ba..3f72379c3553e27bc3f685f8756f163a4aa5f860 100644 (file)
--- a/ggml.c
+++ b/ggml.c
@@ -106,6 +106,9 @@ typedef void * thread_ret_t;
  #include <sys/stat.h>
  #include <unistd.h>
  
+#endif
+#ifdef GGML_USE_CPU_HBM
+#include <hbwmalloc.h>
  #endif
  
  // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
@@ -195,8 +198,14 @@ typedef void * thread_ret_t;
  #define GGML_ALIGNED_FREE(ptr)    _aligned_free(ptr)
  #else
  inline static void * ggml_aligned_malloc(size_t size) {
+    if (size == 0) {
+        GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n");
+        return NULL;
+    }
      void * aligned_memory = NULL;
-#ifdef GGML_USE_METAL
+#ifdef GGML_USE_CPU_HBM
+    int result = hbw_posix_memalign(&aligned_memory, 16, size);
+#elif GGML_USE_METAL
      int result = posix_memalign(&aligned_memory, sysconf(_SC_PAGESIZE), size);
  #else
      int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
@@ -218,8 +227,12 @@ inline static void * ggml_aligned_malloc(size_t size) {
      return aligned_memory;
  }
  #define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
+#ifdef GGML_USE_CPU_HBM
+#define GGML_ALIGNED_FREE(ptr)    if(NULL != ptr) hbw_free(ptr)
+#else
  #define GGML_ALIGNED_FREE(ptr)    free(ptr)
  #endif
+#endif
  
  #define UNUSED GGML_UNUSED
  #define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
@@ -4571,6 +4584,11 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
          return NULL;
      }
  
+    // allow to call ggml_init with 0 size
+    if (params.mem_size == 0) {
+        params.mem_size = GGML_MEM_ALIGN;
+    }
+
      const size_t mem_size = params.mem_buffer ? params.mem_size : GGML_PAD(params.mem_size, GGML_MEM_ALIGN);
  
      *ctx = (struct ggml_context) {
@@ -4773,7 +4791,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
  
      size_t obj_alloc_size = 0;
  
-    if (view_src == NULL && ctx->no_alloc == false) {
+    if (view_src == NULL && !ctx->no_alloc) {
          if (ctx->scratch.data != NULL) {
              // allocate tensor data in the scratch buffer
              if (ctx->scratch.offs + data_size > ctx->scratch.size) {
@@ -5474,7 +5492,7 @@ static struct ggml_tensor * ggml_mul_impl(
      }
  
      if (inplace) {
-        GGML_ASSERT(is_node == false);
+        GGML_ASSERT(!is_node);
      }
  
      struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
@@ -5517,7 +5535,7 @@ static struct ggml_tensor * ggml_div_impl(
      }
  
      if (inplace) {
-        GGML_ASSERT(is_node == false);
+        GGML_ASSERT(!is_node);
      }
  
      struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
@@ -19961,7 +19979,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
  
          struct ggml_tensor * data = NULL;
  
-        if (params.no_alloc == false) {
+        if (!params.no_alloc) {
              data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size);
  
              ok = ok && data != NULL;
@@ -20002,7 +20020,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
              }
  
              // point the data member to the appropriate location in the binary blob using the tensor infos
-            if (params.no_alloc == false) {
+            if (!params.no_alloc) {
                //cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file
                  cur->data = (char *) data->data + ctx->infos[i].offset;               // offset from data
              }
author	Georgi Gerganov <redacted>
	Fri, 8 Sep 2023 14:58:31 +0000 (17:58 +0300)
committer	GitHub <redacted>
	Fri, 8 Sep 2023 14:58:31 +0000 (17:58 +0300)
ggml-metal.m		patch | blob | history
ggml-metal.metal		patch | blob | history
ggml.c		patch | blob | history