sync : llama.cpp (cuda, gguf and linker fixes)

author Georgi Gerganov <redacted>

Thu, 16 Nov 2023 15:06:55 +0000 (17:06 +0200)

committer Georgi Gerganov <redacted>

Thu, 16 Nov 2023 15:06:55 +0000 (17:06 +0200)
author Georgi Gerganov <redacted>
Thu, 16 Nov 2023 15:06:55 +0000 (17:06 +0200)
committer Georgi Gerganov <redacted>
Thu, 16 Nov 2023 15:06:55 +0000 (17:06 +0200)
diff --git a/src/ggml-cuda.cu b/src/ggml-cuda.cu

index 7be63925f4edadfb9433a9f73921a8544d378309..c0c9edd56dbc232b060afd62304d04bf4df45be3 100644 (file)
--- a/src/ggml-cuda.cu
+++ b/src/ggml-cuda.cu
@@ -88,6 +88,8 @@
  #define CC_OFFSET_AMD 1000000
  #define CC_RDNA2      (CC_OFFSET_AMD + 1030)
  
+#define GGML_CUDA_MAX_NODES 8192
+
  // define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
  // on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
  // for large computational tasks. the drawback is that this requires some extra amount of VRAM:
@@ -7727,7 +7729,7 @@ static void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1,
      ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi);
  }
  
-void ggml_cuda_im2col(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_im2col(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
      ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_im2col);
  }
  
@@ -7842,11 +7844,11 @@ static size_t g_temp_tensor_extra_index = 0;
  
  static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
      if (g_temp_tensor_extras == nullptr) {
-        g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_DEFAULT_GRAPH_SIZE];
+        g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
      }
  
      size_t alloc_index = g_temp_tensor_extra_index;
-    g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_DEFAULT_GRAPH_SIZE;
+    g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
      ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
      memset(extra, 0, sizeof(*extra));
  
@@ -8173,11 +8175,11 @@ struct ggml_backend_buffer_context_cuda {
  
      ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
          if (temp_tensor_extras == nullptr) {
-            temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_DEFAULT_GRAPH_SIZE];
+            temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
          }
  
          size_t alloc_index = temp_tensor_extra_index;
-        temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_DEFAULT_GRAPH_SIZE;
+        temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
          ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index];
          memset(extra, 0, sizeof(*extra));
  
diff --git a/src/ggml-quants.c b/src/ggml-quants.c

index a48eda7320c46d2b39f0d9ed76e14aeee3a2461d..cf2860b8cbd5924c3da459a5feb78541dd120323 100644 (file)
--- a/src/ggml-quants.c
+++ b/src/ggml-quants.c
@@ -1368,7 +1368,12 @@ static float make_qkx2_quants(int n, int nmax, const float * restrict x, const f
      float max = x[0];
      float sum_w = weights[0];
      float sum_x = sum_w * x[0];
+#ifdef HAVE_BUGGY_APPLE_LINKER
+    // use 'volatile' to prevent unroll and work around a bug in Apple ld64 1015.7
+    for (volatile int i = 1; i < n; ++i) {
+#else
      for (int i = 1; i < n; ++i) {
+#endif
          if (x[i] < min) min = x[i];
          if (x[i] > max) max = x[i];
          float w = weights[i];
diff --git a/src/ggml.c b/src/ggml.c

index 3202a517b78686da7ab1721781899cd09af05356..ada1067da56d45ba5111311ad390671b940c2815 100644 (file)
--- a/src/ggml.c
+++ b/src/ggml.c
@@ -18073,7 +18073,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
      {
          ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv));
  
-        for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
+        for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
              struct gguf_kv * kv = &ctx->kv[i];
  
              //fprintf(stderr, "%s: reading kv %d\n", __func__, i);
@@ -18120,7 +18120,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                              case GGUF_TYPE_STRING:
                                  {
                                      kv->value.arr.data = malloc(kv->value.arr.n * sizeof(struct gguf_str));
-                                    for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
+                                    for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
                                          ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
                                      }
                                  } break;
@@ -18148,7 +18148,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
      {
          ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
  
-        for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+        for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
              struct gguf_tensor_info * info = &ctx->infos[i];
  
              for (int j = 0; j < GGML_MAX_DIMS; ++j) {
@@ -18195,7 +18195,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
      // compute the total size of the data section, taking into account the alignment
      {
          ctx->size = 0;
-        for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+        for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
              struct gguf_tensor_info * info = &ctx->infos[i];
  
              const int64_t ne =
@@ -18264,7 +18264,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
          ggml_set_no_alloc(ctx_data, true);
  
          // create the tensors
-        for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+        for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
              const int64_t ne[GGML_MAX_DIMS] = {
                  ctx->infos[i].ne[0],
                  ctx->infos[i].ne[1],
author	Georgi Gerganov <redacted>
	Thu, 16 Nov 2023 15:06:55 +0000 (17:06 +0200)
committer	Georgi Gerganov <redacted>
	Thu, 16 Nov 2023 15:06:55 +0000 (17:06 +0200)
src/ggml-cuda.cu		patch | blob | history
src/ggml-quants.c		patch | blob | history
src/ggml.c		patch | blob | history