//struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
struct ggml_tensor * result = ggml_view_tensor(ctx, a);
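+ // the op's integer operands are passed in a small I32 tensor:
+ // b->data[0] = n_past, b->data[1] = n_head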
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
+ struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
((int32_t *) b->data)[0] = n_past;
((int32_t *) b->data)[1] = n_head;
static void ggml_compute_forward_alibi_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
assert(params->ith == 0);
assert(src1->type == GGML_TYPE_I32);
- assert(ggml_nelements(src1) == 3);
+ assert(ggml_nelements(src1) == 2);
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return;
}
const int n_past = ((int32_t *) src1->data)[0];
const int n_head = ((int32_t *) src1->data)[1];
- const int mode = ((int32_t *) src1->data)[2];
const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
const int ne1 = src0->ne[1]; // seq_len_without_past
- const int ne2 = src0->ne[2]; // n_head -> this is k
- const int ne3 = src0->ne[3]; // 1 -> bsz
+ //const int ne2 = src0->ne[2]; // n_head -> this is k
+ //const int ne3 = src0->ne[3]; // 1 -> bsz
const int n = ggml_nrows(src0);
const int ne2_ne3 = n/ne1; // ne2*ne3 = number of (head, batch) pairs
const int nb0 = src0->nb[0];
const int nb1 = src0->nb[1];
const int nb2 = src0->nb[2];
- const int nb3 = src0->nb[3];
+ //const int nb3 = src0->nb[3];
assert(nb0 == sizeof(float));
assert(ne1+n_past == ne0);
// add alibi to src0 (KQ_scaled)
const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
- const float m0 = pow(2.0, -8.0 / n_heads_log2_floor);
- const float m1 = pow(2.0, -4.0 / n_heads_log2_floor);
+
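+ // ALiBi slopes (Press et al., "Train Short, Test Long"): the first
+ // n_heads_log2_floor heads take powers of m0; any remaining heads are
+ // interleaved using powers of m1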
+ const float m0 = powf(2.0f, -8.0f / n_heads_log2_floor);
+ const float m1 = powf(2.0f, -4.0f / n_heads_log2_floor);
for (int i = 0; i < ne0; i++) {
for (int j = 0; j < ne1; j++) {
for (int k = 0; k < ne2_ne3; k++) {
float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
- float * dst_data = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);
+ float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);
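+ // dst is a view of src0, so src0's strides are valid for dst->data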
// TODO: k*nb2 or k*nb3
float m_k;
+
if (k < n_heads_log2_floor) {
- m_k = pow(m0, k + 1);
+ m_k = powf(m0, k + 1);
} else {
- m_k = pow(m1, 2 * (k - n_heads_log2_floor) + 1);
+ m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
}
- //TODO: optimize
- dst_data[0] = (j+1) * m_k + src[0];
+
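+ // each element of row j gets a per-head bias of (j+1) * m_k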
+ pdst[0] = (j+1) * m_k + src[0];
}
}
}
-
}
static void ggml_compute_forward_alibi_f16(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
assert(params->ith == 0);
assert(src1->type == GGML_TYPE_I32);
- assert(ggml_nelements(src1) == 3);
+ assert(ggml_nelements(src1) == 2);
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return;
}
const int n_past = ((int32_t *) src1->data)[0];
const int n_head = ((int32_t *) src1->data)[1];
- const int mode = ((int32_t *) src1->data)[2];
const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
const int ne1 = src0->ne[1]; // seq_len_without_past
- const int ne2 = src0->ne[2]; // n_head -> this is k
- const int ne3 = src0->ne[3]; // 1 -> bsz
+ //const int ne2 = src0->ne[2]; // n_head -> this is k
+ //const int ne3 = src0->ne[3]; // 1 -> bsz
const int n = ggml_nrows(src0);
const int ne2_ne3 = n/ne1; // ne2*ne3 = number of (head, batch) pairs
const int nb0 = src0->nb[0];
const int nb1 = src0->nb[1];
const int nb2 = src0->nb[2];
- const int nb3 = src0->nb[3];
+ //const int nb3 = src0->nb[3];
- assert(nb0 == sizeof(float));
+ assert(nb0 == sizeof(ggml_fp16_t));
assert(ne1+n_past == ne0);
// add alibi to src0 (KQ_scaled)
const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
- const ggml_fp16_t m0 = pow(2.0, -8.0 / n_heads_log2_floor);
- const ggml_fp16_t m1 = pow(2.0, -4.0 / n_heads_log2_floor);
+
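+ // same slope scheme as the F32 path; the slopes stay in F32 precision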
+ const float m0 = powf(2.0f, -8.0f / n_heads_log2_floor);
+ const float m1 = powf(2.0f, -4.0f / n_heads_log2_floor);
for (int i = 0; i < ne0; i++) {
for (int j = 0; j < ne1; j++) {
for (int k = 0; k < ne2_ne3; k++) {
- ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
- ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);
+ ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
+ float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);
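+ // src is read as F16, the result is written out as F32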
// TODO: k*nb2 or k*nb3
- ggml_fp16_t m_k;
+ float m_k;
+
if (k < n_heads_log2_floor) {
- m_k = pow(m0, k + 1);
+ m_k = powf(m0, k + 1);
} else {
- m_k = pow(m1, 2 * (k - n_heads_log2_floor) + 1);
+ m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
}
- //TODO: optimize
- dst_data[0] = (j+1) * m_k + src[0];
+
+ // we return F32
+ pdst[0] = (j+1) * m_k + GGML_FP16_TO_FP32(src[0]);
}
}
}
-
}
static void ggml_compute_forward_alibi(
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q4_2:
case GGML_TYPE_Q4_3:
+ case GGML_TYPE_Q5_0:
+ case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0:
+ case GGML_TYPE_Q8_1:
case GGML_TYPE_I8:
case GGML_TYPE_I16:
case GGML_TYPE_I32:
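{
// remaining types are not supported by alibi
GGML_ASSERT(false);
} break;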