sync : ggml (ggml-alloc + linker + gguf fixes) (#1501)

author Georgi Gerganov <redacted>

Fri, 17 Nov 2023 08:00:07 +0000 (10:00 +0200)

committer GitHub <redacted>

Fri, 17 Nov 2023 08:00:07 +0000 (10:00 +0200)
author Georgi Gerganov <redacted>
Fri, 17 Nov 2023 08:00:07 +0000 (10:00 +0200)
committer GitHub <redacted>
Fri, 17 Nov 2023 08:00:07 +0000 (10:00 +0200)
diff --git a/ggml-alloc.c b/ggml-alloc.c

index f400dc56a20d054ee7d8879447bec42c2732d59e..cdfe4caf69613d6778f8a80af9b0d6ffc8e67652 100644 (file)
--- a/ggml-alloc.c
+++ b/ggml-alloc.c
@@ -446,12 +446,14 @@ static ggml_tallocr_t node_tallocr(ggml_gallocr_t galloc, struct ggml_tensor * n
      return galloc->hash_allocs[ggml_hash_find_or_insert(galloc->hash_set, node)];
  }
  
-static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view) {
+static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool update_backend) {
      ggml_tallocr_t alloc = node_tallocr(galloc, view);
  
      //printf("init_view: %s from src %s\n", view->name, view->view_src->name);
      GGML_ASSERT(view->view_src != NULL && view->view_src->data != NULL);
-    view->backend = view->view_src->backend;
+    if (update_backend) {
+        view->backend = view->view_src->backend;
+    }
      view->buffer  = view->view_src->buffer;
      view->data    = (char *)view->view_src->data + view->view_offs;
  
@@ -469,7 +471,7 @@ static void allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
  
      if (node->data == NULL) {
          if (ggml_is_view(node)) {
-            init_view(galloc, node);
+            init_view(galloc, node, true);
          } else {
              // see if we can reuse a parent's buffer (inplace)
              if (ggml_op_can_inplace(node->op)) {
@@ -499,15 +501,14 @@ static void allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
                                  AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
                                  node->view_src = view_src;
                                  view_src_hn->n_views += 1;
-                                init_view(galloc, node);
+                                init_view(galloc, node, false);
                                  return;
                              }
-                        }
-                        else {
+                        } else {
                              AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
                              node->view_src = parent;
                              p_hn->n_views += 1;
-                            init_view(galloc, node);
+                            init_view(galloc, node, false);
                              return;
                          }
                      }
@@ -537,7 +538,7 @@ static void ggml_tallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
              hash_get(galloc, view_src)->n_views += 1;
              if (node->buffer == NULL && node->data != NULL) {
                  // view of a pre-allocated tensor, didn't call init_view() yet
-                init_view(galloc, node);
+                init_view(galloc, node, true);
              }
          }
  
@@ -548,7 +549,7 @@ static void ggml_tallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
              }
              hash_get(galloc, parent)->n_children += 1;
              if (ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) {
-                init_view(galloc, parent);
+                init_view(galloc, parent, true);
              }
          }
     }
@@ -663,7 +664,7 @@ size_t ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, ggml_tallocr_t talloc, st
      return max_size;
  }
  
-void ggml_gallocr_alloc_graph_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, struct ggml_hash_set hash_set, ggml_tallocr_t * hash_node_alloct) {
+void ggml_gallocr_alloc_graph_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, struct ggml_hash_set hash_set, ggml_tallocr_t * hash_node_talloc) {
      const size_t hash_size = hash_set.size;
  
      GGML_ASSERT(hash_size >= (size_t)(graph->n_nodes + graph->n_leafs));
@@ -686,7 +687,7 @@ void ggml_gallocr_alloc_graph_n(ggml_gallocr_t galloc, struct ggml_cgraph * grap
      // reset hash values
      memset(galloc->hash_values, 0, sizeof(struct hash_node) * hash_size);
  
-    galloc->hash_allocs = hash_node_alloct;
+    galloc->hash_allocs = hash_node_talloc;
  
      ggml_tallocr_alloc_graph_impl(galloc, graph);
  
diff --git a/ggml-quants.c b/ggml-quants.c

index a48eda7320c46d2b39f0d9ed76e14aeee3a2461d..cf2860b8cbd5924c3da459a5feb78541dd120323 100644 (file)
--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -1368,7 +1368,12 @@ static float make_qkx2_quants(int n, int nmax, const float * restrict x, const f
      float max = x[0];
      float sum_w = weights[0];
      float sum_x = sum_w * x[0];
+#ifdef HAVE_BUGGY_APPLE_LINKER
+    // use 'volatile' to prevent unroll and work around a bug in Apple ld64 1015.7
+    for (volatile int i = 1; i < n; ++i) {
+#else
      for (int i = 1; i < n; ++i) {
+#endif
          if (x[i] < min) min = x[i];
          if (x[i] > max) max = x[i];
          float w = weights[i];
diff --git a/ggml.c b/ggml.c

index 584ee4680378d9ddb3722d61e618d9837a07a88d..ada1067da56d45ba5111311ad390671b940c2815 100644 (file)
--- a/ggml.c
+++ b/ggml.c
@@ -5024,8 +5024,13 @@ struct ggml_tensor * ggml_rope_back(
          int                   n_dims,
          int                   mode,
          int                   n_ctx,
+        int                   n_orig_ctx,
          float                 freq_base,
          float                 freq_scale,
+        float                 ext_factor,
+        float                 attn_factor,
+        float                 beta_fast,
+        float                 beta_slow,
          float                 xpos_base,
          bool                  xpos_down) {
      GGML_ASSERT(ggml_is_vector(b));
@@ -5042,11 +5047,15 @@ struct ggml_tensor * ggml_rope_back(
  
      struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
  
-    int32_t params[8] = { /*n_past*/ 0, n_dims, mode, n_ctx };
-    memcpy(params + 4, &freq_base,  sizeof(float));
-    memcpy(params + 5, &freq_scale, sizeof(float));
-    memcpy(params + 6, &xpos_base,  sizeof(float));
-    memcpy(params + 7, &xpos_down,  sizeof(bool));
+    int32_t params[13] = { /*n_past*/ 0, n_dims, mode, n_ctx, n_orig_ctx };
+    memcpy(params +  5, &freq_base,    sizeof(float));
+    memcpy(params +  6, &freq_scale,   sizeof(float));
+    memcpy(params +  7, &ext_factor,   sizeof(float));
+    memcpy(params +  8, &attn_factor,  sizeof(float));
+    memcpy(params +  9, &beta_fast,    sizeof(float));
+    memcpy(params + 10, &beta_slow,    sizeof(float));
+    memcpy(params + 11, &xpos_base,    sizeof(float));
+    memcpy(params + 12, &xpos_down,    sizeof(bool));
      ggml_set_op_params(result, params, sizeof(params));
  
      result->op   = GGML_OP_ROPE_BACK;
@@ -9376,7 +9385,6 @@ static bool ggml_compute_forward_mul_mat_use_blas(
  }
  #endif
  
-
  static void ggml_compute_forward_mul_mat(
          const struct ggml_compute_params * params,
          const struct ggml_tensor * src0,
@@ -10946,7 +10954,8 @@ static void ggml_compute_forward_rope_f32(
          const struct ggml_compute_params * params,
          const struct ggml_tensor * src0,
          const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
+        struct ggml_tensor * dst,
+        const bool forward) {
      if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
          return;
      }
@@ -11005,6 +11014,11 @@ static void ggml_compute_forward_rope_f32(
      const bool is_neox = mode & 2;
      const bool is_glm  = mode & 4;
  
+    // backward process uses inverse rotation by cos and sin.
+    // cos and sin build a rotation matrix, where the inverse is the transpose.
+    // this essentially just switches the sign of sin.
+    const float sin_sign = forward ? 1.0f : -1.0f;
+
      const int32_t * pos = (const int32_t *) src1->data;
  
      for (int64_t i3 = 0; i3 < ne3; i3++) {
@@ -11021,9 +11035,9 @@ static void ggml_compute_forward_rope_f32(
                      float block_theta = MAX(p - (n_ctx - 2), 0);
                      for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
                          const float cos_theta = cosf(theta_base);
-                        const float sin_theta = sinf(theta_base);
+                        const float sin_theta = sinf(theta_base) * sin_sign;
                          const float cos_block_theta = cosf(block_theta);
-                        const float sin_block_theta = sinf(block_theta);
+                        const float sin_block_theta = sinf(block_theta) * sin_sign;
  
                          theta_base *= theta_scale;
                          block_theta *= theta_scale;
@@ -11047,6 +11061,7 @@ static void ggml_compute_forward_rope_f32(
                          rope_yarn(
                              theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta
                          );
+                        sin_theta *= sin_sign;
  
                          // zeta scaling for xPos only:
                          float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
@@ -11077,6 +11092,7 @@ static void ggml_compute_forward_rope_f32(
                                  theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
                                  &cos_theta, &sin_theta
                              );
+                            sin_theta *= sin_sign;
  
                              theta_base *= theta_scale;
  
@@ -11102,7 +11118,8 @@ static void ggml_compute_forward_rope_f16(
          const struct ggml_compute_params * params,
          const struct ggml_tensor * src0,
          const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
+        struct ggml_tensor * dst,
+        const bool forward) {
      if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
          return;
      }
@@ -11154,6 +11171,11 @@ static void ggml_compute_forward_rope_f16(
      const bool is_neox = mode & 2;
      const bool is_glm  = mode & 4;
  
+    // backward process uses inverse rotation by cos and sin.
+    // cos and sin build a rotation matrix, where the inverse is the transpose.
+    // this essentially just switches the sign of sin.
+    const float sin_sign = forward ? 1.0f : -1.0f;
+
      const int32_t * pos = (const int32_t *) src1->data;
  
      for (int64_t i3 = 0; i3 < ne3; i3++) {
@@ -11170,9 +11192,9 @@ static void ggml_compute_forward_rope_f16(
                      float block_theta = MAX(p - (n_ctx - 2), 0);
                      for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
                          const float cos_theta = cosf(theta_base);
-                        const float sin_theta = sinf(theta_base);
+                        const float sin_theta = sinf(theta_base) * sin_sign;
                          const float cos_block_theta = cosf(block_theta);
-                        const float sin_block_theta = sinf(block_theta);
+                        const float sin_block_theta = sinf(block_theta) * sin_sign;
  
                          theta_base *= theta_scale;
                          block_theta *= theta_scale;
@@ -11196,6 +11218,7 @@ static void ggml_compute_forward_rope_f16(
                          rope_yarn(
                              theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta
                          );
+                        sin_theta *= sin_sign;
  
                          theta_base *= theta_scale;
  
@@ -11222,6 +11245,7 @@ static void ggml_compute_forward_rope_f16(
                                  theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
                                  &cos_theta, &sin_theta
                              );
+                            sin_theta *= sin_sign;
  
                              theta_base *= theta_scale;
  
@@ -11251,11 +11275,11 @@ static void ggml_compute_forward_rope(
      switch (src0->type) {
          case GGML_TYPE_F16:
              {
-                ggml_compute_forward_rope_f16(params, src0, src1, dst);
+                ggml_compute_forward_rope_f16(params, src0, src1, dst, true);
              } break;
          case GGML_TYPE_F32:
              {
-                ggml_compute_forward_rope_f32(params, src0, src1, dst);
+                ggml_compute_forward_rope_f32(params, src0, src1, dst, true);
              } break;
          default:
              {
@@ -11266,216 +11290,6 @@ static void ggml_compute_forward_rope(
  
  // ggml_compute_forward_rope_back
  
-static void ggml_compute_forward_rope_back_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    // y = rope(x, src1)
-    // dx = rope_back(dy, src1)
-    // src0 is dy, src1 contains options
-
-    float freq_base;
-    float freq_scale;
-
-    // these two only relevant for xPos RoPE:
-    float xpos_base;
-    bool xpos_down;
-
-    //const int n_past = ((int32_t *) dst->op_params)[0];
-    const int n_dims = ((int32_t *) dst->op_params)[1];
-    const int mode   = ((int32_t *) dst->op_params)[2];
-    const int n_ctx  = ((int32_t *) dst->op_params)[3]; UNUSED(n_ctx);
-    memcpy(&freq_base,  (int32_t *) dst->op_params + 4, sizeof(float));
-    memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
-    memcpy(&xpos_base,  (int32_t *) dst->op_params + 6, sizeof(float));
-    memcpy(&xpos_down,  (int32_t *) dst->op_params + 7, sizeof(bool));
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
-    //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
-
-    assert(nb0 == sizeof(float));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nr = ggml_nrows(dst);
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    // row index used to determine which thread to use
-    int ir = 0;
-
-    const float theta_scale = powf(freq_base, -2.0f/n_dims);
-
-    const bool is_neox = mode & 2;
-
-    const int32_t * pos = (const int32_t *) src1->data;
-
-    for (int64_t i3 = 0; i3 < ne3; i3++) {
-        for (int64_t i2 = 0; i2 < ne2; i2++) {
-            const int64_t p = pos[i2];
-            for (int64_t i1 = 0; i1 < ne1; i1++) {
-                if (ir++ < ir0) continue;
-                if (ir   > ir1) break;
-
-                float theta_base = freq_scale * (float)p;
-
-                if (!is_neox) {
-                    for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
-                        const float cos_theta = cosf(theta_base);
-                        const float sin_theta = sinf(theta_base);
-
-                        // zeta scaling for xPos only:
-                        float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
-                        if (xpos_down) zeta = 1.0f / zeta;
-
-                        theta_base *= theta_scale;
-
-                        const float * const dy  = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                              float *       dx  = (float *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
-
-                        const float dy0 = dy[0];
-                        const float dy1 = dy[1];
-
-                        dx[0] =   dy0*cos_theta*zeta + dy1*sin_theta*zeta;
-                        dx[1] = - dy0*sin_theta*zeta + dy1*cos_theta*zeta;
-                    }
-                } else {
-                    for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
-                        for (int64_t ic = 0; ic < n_dims; ic += 2) {
-                            const float cos_theta = cosf(theta_base);
-                            const float sin_theta = sinf(theta_base);
-
-                            theta_base *= theta_scale;
-
-                            const int64_t i0 = ib*n_dims + ic/2;
-
-                            const float * const dy  = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                                  float *       dx  = (float *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
-
-                            const float dy0 = dy[0];
-                            const float dy1 = dy[n_dims/2];
-
-                            dx[0]        =   dy0*cos_theta + dy1*sin_theta;
-                            dx[n_dims/2] = - dy0*sin_theta + dy1*cos_theta;
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_rope_back_f16(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    // y = rope(x, src1)
-    // dx = rope_back(dy, src1)
-    // src0 is dy, src1 contains options
-
-    //const int n_past = ((int32_t *) dst->op_params)[0];
-    const int n_dims = ((int32_t *) dst->op_params)[1];
-    const int mode   = ((int32_t *) dst->op_params)[2];
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
-    //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
-
-    assert(nb0 == sizeof(ggml_fp16_t));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nr = ggml_nrows(dst);
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    // row index used to determine which thread to use
-    int ir = 0;
-
-    const float theta_scale = powf(10000.0, -2.0f/n_dims);
-
-    const bool is_neox = mode & 2;
-
-    const int32_t * pos = (const int32_t *) src1->data;
-
-    for (int64_t i3 = 0; i3 < ne3; i3++) {
-        for (int64_t i2 = 0; i2 < ne2; i2++) {
-            const int64_t p = pos[i2];
-            for (int64_t i1 = 0; i1 < ne1; i1++) {
-                if (ir++ < ir0) continue;
-                if (ir   > ir1) break;
-
-                float theta_base = (float)p;
-
-                if (!is_neox) {
-                    for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
-                        const float cos_theta = cosf(theta_base);
-                        const float sin_theta = sinf(theta_base);
-
-                        theta_base *= theta_scale;
-
-                        const ggml_fp16_t * const dy  = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                              ggml_fp16_t *       dx  = (ggml_fp16_t *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
-
-                        const float dy0 = GGML_FP16_TO_FP32(dy[0]);
-                        const float dy1 = GGML_FP16_TO_FP32(dy[1]);
-
-                        dx[0] = GGML_FP32_TO_FP16( dy0*cos_theta + dy1*sin_theta);
-                        dx[1] = GGML_FP32_TO_FP16(-dy0*sin_theta + dy1*cos_theta);
-                    }
-                } else {
-                    for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
-                        for (int64_t ic = 0; ic < n_dims; ic += 2) {
-                            const float cos_theta = cosf(theta_base);
-                            const float sin_theta = sinf(theta_base);
-
-                            theta_base *= theta_scale;
-
-                            const int64_t i0 = ib*n_dims + ic/2;
-
-                            const ggml_fp16_t * const dy  = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                                  ggml_fp16_t *       dx  = (ggml_fp16_t *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
-
-                            const float dy0 = GGML_FP16_TO_FP32(dy[0]);
-                            const float dy1 = GGML_FP16_TO_FP32(dy[n_dims/2]);
-
-                            dx[0]        = GGML_FP32_TO_FP16( dy0*cos_theta + dy1*sin_theta);
-                            dx[n_dims/2] = GGML_FP32_TO_FP16(-dy0*sin_theta + dy1*cos_theta);
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
-
  static void ggml_compute_forward_rope_back(
          const struct ggml_compute_params * params,
          const struct ggml_tensor * src0,
@@ -11484,11 +11298,11 @@ static void ggml_compute_forward_rope_back(
      switch (src0->type) {
          case GGML_TYPE_F16:
              {
-                ggml_compute_forward_rope_back_f16(params, src0, src1, dst);
+                ggml_compute_forward_rope_f16(params, src0, src1, dst, false);
              } break;
          case GGML_TYPE_F32:
              {
-                ggml_compute_forward_rope_back_f32(params, src0, src1, dst);
+                ggml_compute_forward_rope_f32(params, src0, src1, dst, false);
              } break;
          default:
              {
@@ -14923,17 +14737,20 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                  // necessary for llama
                  if (src0->grad) {
                      //const int n_past = ((int32_t *) tensor->op_params)[0];
-                    const int n_dims = ((int32_t *) tensor->op_params)[1];
-                    const int mode   = ((int32_t *) tensor->op_params)[2];
-                    const int n_ctx  = ((int32_t *) tensor->op_params)[3];
-                    float freq_base;
-                    float freq_scale;
-                    float xpos_base;
-                    bool  xpos_down;
-                    memcpy(&freq_base,  (int32_t *) tensor->op_params + 4, sizeof(float));
-                    memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float));
-                    memcpy(&xpos_base,  (int32_t *) tensor->op_params + 6, sizeof(float));
-                    memcpy(&xpos_down,  (int32_t *) tensor->op_params + 7, sizeof(bool));
+                    const int n_dims     = ((int32_t *) tensor->op_params)[1];
+                    const int mode       = ((int32_t *) tensor->op_params)[2];
+                    const int n_ctx      = ((int32_t *) tensor->op_params)[3];
+                    const int n_orig_ctx = ((int32_t *) tensor->op_params)[4];
+                    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, xpos_base, xpos_down;
+
+                    memcpy(&freq_base,   (int32_t *) tensor->op_params +  5, sizeof(float));
+                    memcpy(&freq_scale,  (int32_t *) tensor->op_params +  6, sizeof(float));
+                    memcpy(&ext_factor,  (int32_t *) tensor->op_params +  7, sizeof(float));
+                    memcpy(&attn_factor, (int32_t *) tensor->op_params +  8, sizeof(float));
+                    memcpy(&beta_fast,   (int32_t *) tensor->op_params +  9, sizeof(float));
+                    memcpy(&beta_slow,   (int32_t *) tensor->op_params + 10, sizeof(float));
+                    memcpy(&xpos_base,   (int32_t *) tensor->op_params + 11, sizeof(float));
+                    memcpy(&xpos_down,   (int32_t *) tensor->op_params + 12, sizeof(bool));
  
                      src0->grad = ggml_add_or_set(ctx,
                              src0->grad,
@@ -14943,8 +14760,13 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                                  n_dims,
                                  mode,
                                  n_ctx,
+                                n_orig_ctx,
                                  freq_base,
                                  freq_scale,
+                                ext_factor,
+                                attn_factor,
+                                beta_fast,
+                                beta_slow,
                                  xpos_base,
                                  xpos_down),
                              zero_table);
@@ -14954,17 +14776,20 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
              {
                  if (src0->grad) {
                      //const int n_past = ((int32_t *) tensor->op_params)[0];
-                    const int n_dims = ((int32_t *) tensor->op_params)[1];
-                    const int mode   = ((int32_t *) tensor->op_params)[2];
-                    const int n_ctx  = ((int32_t *) tensor->op_params)[3];
-                    float freq_base;
-                    float freq_scale;
-                    float xpos_base;
-                    bool  xpos_down;
-                    memcpy(&freq_base,  (int32_t *) tensor->op_params + 4, sizeof(float));
-                    memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float));
-                    memcpy(&xpos_base,  (int32_t *) tensor->op_params + 6, sizeof(float));
-                    memcpy(&xpos_down,  (int32_t *) tensor->op_params + 7, sizeof(bool));
+                    const int n_dims     = ((int32_t *) tensor->op_params)[1];
+                    const int mode       = ((int32_t *) tensor->op_params)[2];
+                    const int n_ctx      = ((int32_t *) tensor->op_params)[3];
+                    const int n_orig_ctx = ((int32_t *) tensor->op_params)[4];
+                    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, xpos_base, xpos_down;
+
+                    memcpy(&freq_base,   (int32_t *) tensor->op_params +  5, sizeof(float));
+                    memcpy(&freq_scale,  (int32_t *) tensor->op_params +  6, sizeof(float));
+                    memcpy(&ext_factor,  (int32_t *) tensor->op_params +  7, sizeof(float));
+                    memcpy(&attn_factor, (int32_t *) tensor->op_params +  8, sizeof(float));
+                    memcpy(&beta_fast,   (int32_t *) tensor->op_params +  9, sizeof(float));
+                    memcpy(&beta_slow,   (int32_t *) tensor->op_params + 10, sizeof(float));
+                    memcpy(&xpos_base,   (int32_t *) tensor->op_params + 11, sizeof(float));
+                    memcpy(&xpos_down,   (int32_t *) tensor->op_params + 12, sizeof(bool));
  
                      src0->grad = ggml_add_or_set(ctx,
                              src0->grad,
@@ -14973,14 +14798,14 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                                  src1,
                                  n_dims,
                                  mode,
-                                0,
                                  n_ctx,
+                                n_orig_ctx,
                                  freq_base,
                                  freq_scale,
-                                0.0f,
-                                1.0f,
-                                0.0f,
-                                0.0f,
+                                ext_factor,
+                                attn_factor,
+                                beta_fast,
+                                beta_slow,
                                  xpos_base,
                                  xpos_down,
                                  false),
@@ -18248,7 +18073,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
      {
          ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv));
  
-        for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
+        for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
              struct gguf_kv * kv = &ctx->kv[i];
  
              //fprintf(stderr, "%s: reading kv %d\n", __func__, i);
@@ -18295,7 +18120,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                              case GGUF_TYPE_STRING:
                                  {
                                      kv->value.arr.data = malloc(kv->value.arr.n * sizeof(struct gguf_str));
-                                    for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
+                                    for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
                                          ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
                                      }
                                  } break;
@@ -18323,7 +18148,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
      {
          ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
  
-        for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+        for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
              struct gguf_tensor_info * info = &ctx->infos[i];
  
              for (int j = 0; j < GGML_MAX_DIMS; ++j) {
@@ -18370,7 +18195,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
      // compute the total size of the data section, taking into account the alignment
      {
          ctx->size = 0;
-        for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+        for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
              struct gguf_tensor_info * info = &ctx->infos[i];
  
              const int64_t ne =
@@ -18439,7 +18264,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
          ggml_set_no_alloc(ctx_data, true);
  
          // create the tensors
-        for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+        for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
              const int64_t ne[GGML_MAX_DIMS] = {
                  ctx->infos[i].ne[0],
                  ctx->infos[i].ne[1],
diff --git a/ggml.h b/ggml.h

index 52ae6755a89ad2e8746642233fea80a1d149937a..8e6b646066b7a488197becae814d17e504916194 100644 (file)
--- a/ggml.h
+++ b/ggml.h
@@ -1371,8 +1371,13 @@ extern "C" {
              int                   n_dims,
              int                   mode,
              int                   n_ctx,
+            int                   n_orig_ctx,
              float                 freq_base,
              float                 freq_scale,
+            float                 ext_factor,
+            float                 attn_factor,
+            float                 beta_fast,
+            float                 beta_slow,
              float                 xpos_base,
              bool                  xpos_down);
author	Georgi Gerganov <redacted>
	Fri, 17 Nov 2023 08:00:07 +0000 (10:00 +0200)
committer	GitHub <redacted>
	Fri, 17 Nov 2023 08:00:07 +0000 (10:00 +0200)
ggml-alloc.c		patch \| blob \| history
ggml-quants.c		patch \| blob \| history
ggml.c		patch \| blob \| history
ggml.h		patch \| blob \| history