sync : whisper (POSIX) (#511)

author Georgi Gerganov <redacted>

Fri, 8 Sep 2023 14:57:04 +0000 (17:57 +0300)

committer GitHub <redacted>

Fri, 8 Sep 2023 14:57:04 +0000 (17:57 +0300)
author Georgi Gerganov <redacted>
Fri, 8 Sep 2023 14:57:04 +0000 (17:57 +0300)
committer GitHub <redacted>
Fri, 8 Sep 2023 14:57:04 +0000 (17:57 +0300)
diff --git a/CMakeLists.txt b/CMakeLists.txt

index d31be31659e72053e0a57b1977b2c5e06a612776..bf1491bf31ea931a2ea0dc37ea2078c50c541bf8 100644 (file)
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -91,6 +91,57 @@ if (NOT MSVC)
      )
  endif()
  
+#
+# POSIX conformance
+#
+
+# clock_gettime came in POSIX.1b (1993)
+# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
+# posix_memalign came in POSIX.1-2001 / SUSv3
+# M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985)
+add_compile_definitions(_XOPEN_SOURCE=600)
+
+# Somehow in OpenBSD whenever POSIX conformance is specified
+# some string functions rely on locale_t availability,
+# which was introduced in POSIX.1-2008, forcing us to go higher
+if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
+    remove_definitions(-D_XOPEN_SOURCE=600)
+    add_compile_definitions(_XOPEN_SOURCE=700)
+endif()
+
+# Data types, macros and functions related to controlling CPU affinity
+# are available on Linux through GNU extensions in libc
+if (CMAKE_SYSTEM_NAME MATCHES "Linux")
+    add_compile_definitions(_GNU_SOURCE)
+endif()
+
+# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
+# and on macOS its availability depends on enabling Darwin extensions
+# similarly on DragonFly, enabling BSD extensions is necessary
+if (CMAKE_SYSTEM_NAME MATCHES "Darwin")
+    add_compile_definitions(_DARWIN_C_SOURCE)
+endif()
+if (CMAKE_SYSTEM_NAME MATCHES "DragonFly")
+    add_compile_definitions(_DARWIN_C_SOURCE)
+endif()
+
+# alloca is a non-standard interface that is not visible on BSDs when
+# POSIX conformance is specified, but not all of them provide a clean way
+# to enable it in such cases
+if (CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
+    add_compile_definitions(__BSD_VISIBLE)
+endif()
+if (CMAKE_SYSTEM_NAME MATCHES "NetBSD")
+    add_compile_definitions(_NETBSD_SOURCE)
+endif()
+if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
+    add_compile_definitions(_BSD_SOURCE)
+endif()
+
+if (WHISPER_PERF)
+    set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_PERF)
+endif()
+
  # dependencies
  
  set(CMAKE_C_STANDARD   11)
diff --git a/examples/common.cpp b/examples/common.cpp

index 2b8da8f734219bc72f7ac0921d6c97dc8f1c7cd7..11064b8d339a95dcd112ff5b2ab2fcb3d9aff272 100644 (file)
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -792,7 +792,7 @@ bool sam_params_parse(int argc, char ** argv, sam_params & params) {
      return true;
  }
  
-void sam_print_usage(int argc, char ** argv, const sam_params & params) {
+void sam_print_usage(int /*argc*/, char ** argv, const sam_params & params) {
      fprintf(stderr, "usage: %s [options]\n", argv[0]);
      fprintf(stderr, "\n");
      fprintf(stderr, "options:\n");
diff --git a/examples/whisper/main.cpp b/examples/whisper/main.cpp

index fa399c6d78114a9d886e626b3c3bf4b48a68451a..60c1cca756a683de1321e72a38838957e464b629 100644 (file)
--- a/examples/whisper/main.cpp
+++ b/examples/whisper/main.cpp
@@ -260,7 +260,7 @@ std::string estimate_diarization_speaker(std::vector<std::vector<float>> pcmf32s
  
      return speaker;
  }
-void whisper_print_progress_callback(struct whisper_context * ctx, struct whisper_state * /*state*/, int progress, void * user_data) {
+void whisper_print_progress_callback(struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, int progress, void * user_data) {
      int progress_step = ((whisper_print_user_data *) user_data)->params->progress_step;
      int * progress_prev  = &(((whisper_print_user_data *) user_data)->progress_prev);
      if (progress >= *progress_prev + progress_step) {
@@ -492,7 +492,7 @@ bool output_csv(struct whisper_context * ctx, const char * fname, const whisper_
      return true;
  }
  
-bool output_score(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
+bool output_score(struct whisper_context * ctx, const char * fname, const whisper_params & /*params*/, std::vector<std::vector<float>> /*pcmf32s*/) {
      std::ofstream fout(fname);
      fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
  
diff --git a/src/ggml-alloc.c b/src/ggml-alloc.c

index c1939a4b7817b63a173c2b453729592c1457327c..a1f6e7bf4f66eec4d9437c86624cb85d7d72bc39 100644 (file)
--- a/src/ggml-alloc.c
+++ b/src/ggml-alloc.c
@@ -1,8 +1,3 @@
-// defines MAP_ANONYMOUS
-#ifndef _GNU_SOURCE
-#define _GNU_SOURCE
-#endif
-
  #include "ggml-alloc.h"
  #include "ggml.h"
  #include <assert.h>
@@ -138,7 +133,7 @@ static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_ten
  
  void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
  #ifdef GGML_ALLOCATOR_DEBUG
-    GGML_ASSERT(ggml_is_view(tensor) == false); // views generally get data pointer from one of their sources
+    GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
      GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
  #endif
      size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
@@ -165,14 +160,14 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
      if (best_fit_block == -1) {
          // the last block is our last resort
          struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
+        max_avail = MAX(max_avail, block->size);
          if (block->size >= size) {
              best_fit_block = alloc->n_free_blocks - 1;
-            max_avail = MAX(max_avail, block->size);
          } else {
              fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
                      __func__, size, max_avail);
              GGML_ASSERT(!"not enough space in the buffer");
-        return;
+            return;
          }
      }
      struct free_block * block = &alloc->free_blocks[best_fit_block];
@@ -316,7 +311,11 @@ static void * alloc_vmem(size_t size) {
  #if defined(_WIN32)
      return VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS);
  #elif defined(_POSIX_MAPPED_FILES)
-    return mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0);
+    void * ptr = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0);
+    if (ptr == MAP_FAILED) {
+        return NULL;
+    }
+    return ptr;
  #else
      // use a fixed address for other platforms
      uintptr_t base_addr = (uintptr_t)-size - 0x100;
diff --git a/src/ggml-metal.m b/src/ggml-metal.m

index d0d23442eab5d83b74768d0b74bef674861d5e06..7e2355ce6bcc7eeb8abe7f5290cad19e9629cb35 100644 (file)
--- a/src/ggml-metal.m
+++ b/src/ggml-metal.m
@@ -327,7 +327,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
  
  void * ggml_metal_host_malloc(size_t n) {
      void * data = NULL;
-    const int result = posix_memalign((void **) &data, getpagesize(), n);
+    const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n);
      if (result != 0) {
          metal_printf("%s: error: posix_memalign failed\n", __func__);
          return NULL;
@@ -401,7 +401,7 @@ bool ggml_metal_add_buffer(
              }
          }
  
-        const size_t size_page = getpagesize();
+        const size_t size_page = sysconf(_SC_PAGESIZE);
  
          size_t size_aligned = size;
          if ((size_aligned % size_page) != 0) {
@@ -1141,7 +1141,7 @@ void ggml_metal_graph_compute(
                              [encoder setBytes:&freq_base  length:sizeof(float) atIndex:21];
                              [encoder setBytes:&freq_scale length:sizeof(float) atIndex:22];
  
-                            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)];
                          } break;
                      case GGML_OP_DUP:
                      case GGML_OP_CPY:
diff --git a/src/ggml-metal.metal b/src/ggml-metal.metal

index 119fcbeb623c11ca25a1c59c08564d7c3da32fba..5070561fba1ace47d5b224d13ec8e9a0a7771773 100644 (file)
--- a/src/ggml-metal.metal
+++ b/src/ggml-metal.metal
@@ -220,14 +220,10 @@ kernel void kernel_norm(
          }
          threadgroup_barrier(mem_flags::mem_threadgroup);
      }
-    //// broadcast
-    //if (tpitg == 0) {
-    //    sum[0] /= ne00;
-    //}
-    //threadgroup_barrier(mem_flags::mem_threadgroup);
-    const float mean  = sum[0];
+    const float mean  = sum[0] / ne00;
  
      // recenter and VARIANCE
+    threadgroup_barrier(mem_flags::mem_threadgroup);
      device float * y = dst + tgpig*ne00;
      sum[tpitg] = 0.0f;
      for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
@@ -235,12 +231,6 @@ kernel void kernel_norm(
          sum[tpitg] += y[i00] * y[i00];
      }
  
-    //// VARIANCE
-    //// parallel sum
-    //sum[tpitg] = 0.0f;
-    //for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
-    //    sum[tpitg] += y[i00] * y[i00];
-    //}
      // reduce
      threadgroup_barrier(mem_flags::mem_threadgroup);
      for (uint i = ntg/2; i > 0; i /= 2) {
@@ -249,12 +239,7 @@ kernel void kernel_norm(
          }
          threadgroup_barrier(mem_flags::mem_threadgroup);
      }
-    //// broadcast
-    //if (tpitg == 0) {
-    //    sum[0] /= ne00;
-    //}
-    //threadgroup_barrier(mem_flags::mem_threadgroup);
-    const float variance = sum[0];
+    const float variance = sum[0] / ne00;
  
      const float scale = 1.0f/sqrt(variance + eps);
      for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
@@ -262,7 +247,6 @@ kernel void kernel_norm(
      }
  }
  
-
  kernel void kernel_rms_norm(
          device const  void * src0,
          device       float * dst,
@@ -630,7 +614,6 @@ kernel void kernel_mul_mat_f16_f32(
              }
          }
      }
-
  }
  
  kernel void kernel_alibi_f32(
@@ -699,25 +682,27 @@ kernel void kernel_rope(
          constant       int & mode,
          constant     float & freq_base,
          constant     float & freq_scale,
-        uint3 tpig[[thread_position_in_grid]]) {
-    const int64_t i3 = tpig[2];
-    const int64_t i2 = tpig[1];
-    const int64_t i1 = tpig[0];
+        uint  tiitg[[thread_index_in_threadgroup]],
+        uint3 tptg[[threads_per_threadgroup]],
+        uint3 tgpig[[threadgroup_position_in_grid]]) {
+    const int64_t i3 = tgpig[2];
+    const int64_t i2 = tgpig[1];
+    const int64_t i1 = tgpig[0];
  
      const bool is_neox = mode & 2;
-    const float theta_scale = pow(freq_base, -2.0f/n_dims);
  
      const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2);
  
-    float theta = freq_scale * (float)p;
+    const float theta_0 = freq_scale * (float)p;
+    const float inv_ndims = -1.f/n_dims;
  
      if (!is_neox) {
-        for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
+        for (int64_t i0 = 2*tiitg; i0 < ne0; i0 += 2*tptg.x) {
+
+            const float theta = theta_0 * pow(freq_base, inv_ndims*i0);
              const float cos_theta = cos(theta);
              const float sin_theta = sin(theta);
  
-            theta *= theta_scale;
-
              device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
              device       float * dst_data  = (device float *)((device char *)  dst + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
  
@@ -729,12 +714,12 @@ kernel void kernel_rope(
          }
      } else {
          for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
-            for (int64_t ic = 0; ic < n_dims; ic += 2) {
+            for (int64_t ic = 2*tiitg; ic < n_dims; ic += 2*tptg.x) {
+
+                const float theta = theta_0 * pow(freq_base, inv_ndims*ic - ib);
                  const float cos_theta = cos(theta);
                  const float sin_theta = sin(theta);
  
-                theta *= theta_scale;
-
                  const int64_t i0 = ib*n_dims + ic/2;
  
                  device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
diff --git a/src/ggml.c b/src/ggml.c

index 63a22223a4a003699527f813a617d3328bef9c17..3f72379c3553e27bc3f685f8756f163a4aa5f860 100644 (file)
--- a/src/ggml.c
+++ b/src/ggml.c
@@ -1,4 +1,3 @@
-#define _GNU_SOURCE // Defines CLOCK_MONOTONIC on Linux
  #define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows
  
  #include "ggml.h"
@@ -107,6 +106,9 @@ typedef void * thread_ret_t;
  #include <sys/stat.h>
  #include <unistd.h>
  
+#endif
+#ifdef GGML_USE_CPU_HBM
+#include <hbwmalloc.h>
  #endif
  
  // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
@@ -196,9 +198,15 @@ typedef void * thread_ret_t;
  #define GGML_ALIGNED_FREE(ptr)    _aligned_free(ptr)
  #else
  inline static void * ggml_aligned_malloc(size_t size) {
+    if (size == 0) {
+        GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n");
+        return NULL;
+    }
      void * aligned_memory = NULL;
-#ifdef GGML_USE_METAL
-    int result = posix_memalign(&aligned_memory, getpagesize(), size);
+#ifdef GGML_USE_CPU_HBM
+    int result = hbw_posix_memalign(&aligned_memory, 16, size);
+#elif GGML_USE_METAL
+    int result = posix_memalign(&aligned_memory, sysconf(_SC_PAGESIZE), size);
  #else
      int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
  #endif
@@ -219,8 +227,12 @@ inline static void * ggml_aligned_malloc(size_t size) {
      return aligned_memory;
  }
  #define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
+#ifdef GGML_USE_CPU_HBM
+#define GGML_ALIGNED_FREE(ptr)    if(NULL != ptr) hbw_free(ptr)
+#else
  #define GGML_ALIGNED_FREE(ptr)    free(ptr)
  #endif
+#endif
  
  #define UNUSED GGML_UNUSED
  #define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
@@ -4572,6 +4584,11 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
          return NULL;
      }
  
+    // allow to call ggml_init with 0 size
+    if (params.mem_size == 0) {
+        params.mem_size = GGML_MEM_ALIGN;
+    }
+
      const size_t mem_size = params.mem_buffer ? params.mem_size : GGML_PAD(params.mem_size, GGML_MEM_ALIGN);
  
      *ctx = (struct ggml_context) {
@@ -4774,7 +4791,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
  
      size_t obj_alloc_size = 0;
  
-    if (view_src == NULL && ctx->no_alloc == false) {
+    if (view_src == NULL && !ctx->no_alloc) {
          if (ctx->scratch.data != NULL) {
              // allocate tensor data in the scratch buffer
              if (ctx->scratch.offs + data_size > ctx->scratch.size) {
@@ -5475,7 +5492,7 @@ static struct ggml_tensor * ggml_mul_impl(
      }
  
      if (inplace) {
-        GGML_ASSERT(is_node == false);
+        GGML_ASSERT(!is_node);
      }
  
      struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
@@ -5518,7 +5535,7 @@ static struct ggml_tensor * ggml_div_impl(
      }
  
      if (inplace) {
-        GGML_ASSERT(is_node == false);
+        GGML_ASSERT(!is_node);
      }
  
      struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
@@ -19962,7 +19979,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
  
          struct ggml_tensor * data = NULL;
  
-        if (params.no_alloc == false) {
+        if (!params.no_alloc) {
              data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size);
  
              ok = ok && data != NULL;
@@ -20003,7 +20020,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
              }
  
              // point the data member to the appropriate location in the binary blob using the tensor infos
-            if (params.no_alloc == false) {
+            if (!params.no_alloc) {
                //cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file
                  cur->data = (char *) data->data + ctx->infos[i].offset;               // offset from data
              }
diff --git a/tests/test-quantize-perf.cpp b/tests/test-quantize-perf.cpp

index 0bb9537f693ed22ba58bb642d7f124cd2373e8c8..cbea7d4525ca4c7479c48d73fc936aa030a2b537 100644 (file)
--- a/tests/test-quantize-perf.cpp
+++ b/tests/test-quantize-perf.cpp
@@ -76,7 +76,7 @@ void * align_with_offset(void * ptr, int offset) {
      return (char *) std::align(MAX_ALIGNMENT, MAX_ALIGNMENT, ptr, dummy_size) + offset;
  }
  
-void benchmark_function(size_t size, size_t q_size, int64_t iterations, std::function<size_t(void)> function) {
+void benchmark_function(size_t size, size_t q_size, int64_t iterations, const std::function<size_t(void)> & function) {
      int64_t min_time_us = INT64_MAX;
      int64_t total_time_us = 0;
      int64_t min_time_cycles = INT64_MAX;
author	Georgi Gerganov <redacted>
	Fri, 8 Sep 2023 14:57:04 +0000 (17:57 +0300)
committer	GitHub <redacted>
	Fri, 8 Sep 2023 14:57:04 +0000 (17:57 +0300)
CMakeLists.txt		patch \| blob \| history
examples/common.cpp		patch \| blob \| history
examples/whisper/main.cpp		patch \| blob \| history
src/ggml-alloc.c		patch \| blob \| history
src/ggml-metal.m		patch \| blob \| history
src/ggml-metal.metal		patch \| blob \| history
src/ggml.c		patch \| blob \| history
tests/test-quantize-perf.cpp		patch \| blob \| history