examples: add MNIST training + missing ops

author Johannes Gäßler <redacted>

Tue, 30 Jul 2024 13:56:35 +0000 (15:56 +0200)

committer Georgi Gerganov <redacted>

Wed, 28 Aug 2024 10:22:20 +0000 (13:22 +0300)
author Johannes Gäßler <redacted>
Tue, 30 Jul 2024 13:56:35 +0000 (15:56 +0200)
committer Georgi Gerganov <redacted>
Wed, 28 Aug 2024 10:22:20 +0000 (13:22 +0300)
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h

index 4ea7aa91124aa8166e60dda87f315c11f9e81481..126ce068a020a13b9821a48dc9033984ea2e9cd0 100644 (file)
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -220,7 +220,7 @@
  #include <stdio.h>
  
  #define GGML_FILE_MAGIC   0x67676d6c // "ggml"
-#define GGML_FILE_VERSION 1
+#define GGML_FILE_VERSION 2
  
  #define GGML_QNT_VERSION        2    // bump this on quantization format changes
  #define GGML_QNT_VERSION_FACTOR 1000 // do not change this
@@ -490,9 +490,11 @@ extern "C" {
          GGML_OP_CLAMP,
          GGML_OP_CONV_TRANSPOSE_1D,
          GGML_OP_IM2COL,
+        GGML_OP_IM2COL_BACK,
          GGML_OP_CONV_TRANSPOSE_2D,
          GGML_OP_POOL_1D,
          GGML_OP_POOL_2D,
+        GGML_OP_POOL_2D_BACK,
          GGML_OP_UPSCALE, // nearest interpolate
          GGML_OP_PAD,
          GGML_OP_ARANGE,
@@ -1582,34 +1584,49 @@ extern "C" {
              float                 min,
              float                 max);
  
+    // im2col
+    // converts data into a format that effectively results in a convolution when combined with matrix multiplication
      GGML_API struct ggml_tensor * ggml_im2col(
              struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            int                  s0,
-            int                  s1,
-            int                  p0,
-            int                  p1,
-            int                  d0,
-            int                  d1,
-            bool                 is_2D,
-            enum ggml_type       dst_type);
+            struct ggml_tensor  * a,  // convolution kernel
+            struct ggml_tensor  * b,  // data
+            int                   s0, // stride dimension 0
+            int                   s1, // stride dimension 1
+            int                   p0, // padding dimension 0
+            int                   p1, // padding dimension 1
+            int                   d0, // dilation dimension 0
+            int                   d1, // dilation dimension 1
+            bool                  is_2D,
+            enum ggml_type        dst_type);
+
+    GGML_API struct ggml_tensor * ggml_im2col_back(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,  // convolution kernel
+        struct ggml_tensor  * b,  // gradient of im2col output
+        int64_t             * ne, // shape of im2col input
+        int                   s0, // stride dimension 0
+        int                   s1, // stride dimension 1
+        int                   p0, // padding dimension 0
+        int                   p1, // padding dimension 1
+        int                   d0, // dilation dimension 0
+        int                   d1, // dilation dimension 1
+        bool                  is_2D);
  
      GGML_API struct ggml_tensor * ggml_conv_depthwise_2d(
              struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            int                  s0,
-            int                  s1,
-            int                  p0,
-            int                  p1,
-            int                  d0,
-            int                  d1);
+            struct ggml_tensor  * a,  // convolution kernel
+            struct ggml_tensor  * b,  // data
+            int                  s0,  // stride dimension 0
+            int                  s1,  // stride dimension 1
+            int                  p0,  // padding dimension 0
+            int                  p1,  // padding dimension 1
+            int                  d0,  // dilation dimension 0
+            int                  d1); // dilation dimension 1
  
      GGML_API struct ggml_tensor * ggml_conv_1d(
              struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
+            struct ggml_tensor  * a,   // convolution kernel
+            struct ggml_tensor  * b,   // data
              int                   s0,  // stride
              int                   p0,  // padding
              int                   d0); // dilation
@@ -1618,29 +1635,29 @@ extern "C" {
      // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
      GGML_API struct ggml_tensor* ggml_conv_1d_ph(
              struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            int                   s,
-            int                   d);
+            struct ggml_tensor  * a,  // convolution kernel
+            struct ggml_tensor  * b,  // data
+            int                   s,  // stride
+            int                   d); // dilation
  
      GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
              struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            int                   s0,
-            int                   p0,
-            int                   d0);
+            struct ggml_tensor  * a,   // convolution kernel
+            struct ggml_tensor  * b,   // data
+            int                   s0,  // stride
+            int                   p0,  // padding
+            int                   d0); // dilation
  
      GGML_API struct ggml_tensor * ggml_conv_2d(
              struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            int                   s0,
-            int                   s1,
-            int                   p0,
-            int                   p1,
-            int                   d0,
-            int                   d1);
+            struct ggml_tensor  * a,   // convolution kernel
+            struct ggml_tensor  * b,   // data
+            int                   s0,  // stride dimension 0
+            int                   s1,  // stride dimension 1
+            int                   p0,  // padding dimension 0
+            int                   p1,  // padding dimension 1
+            int                   d0,  // dilation dimension 0
+            int                   d1); // dilation dimension 1
  
  
      // kernel size is a->ne[0] x a->ne[1]
@@ -1702,6 +1719,18 @@ extern "C" {
              float                 p0,
              float                 p1);
  
+    GGML_API struct ggml_tensor * ggml_pool_2d_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,  // gradient of the ggml_pool_2d output
+            struct ggml_tensor  * af, // "a"/input used in forward pass
+            enum ggml_op_pool     op, // GGML_OP_POOL_MAX or GGML_OP_POOL_AVG
+            int                   k0, // kernel size dimension 0
+            int                   k1, // kernel size dimension 1
+            int                   s0, // stride dimension 0
+            int                   s1, // stride dimension 1
+            float                 p0, // padding dimension 0
+            float                 p1); // padding dimension 1
+
      // nearest interpolate
      // multiplies ne0 and ne1 by scale factor
      // used in stable-diffusion
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c

index a56c2ffd9e2de38745ca62201f8cf0d46a932094..07d9d50812fe8500f27790e5ab8e5c0e9829ab17 100644 (file)
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -2801,9 +2801,11 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
      "CLAMP",
      "CONV_TRANSPOSE_1D",
      "IM2COL",
+    "IM2COL_BACK",
      "CONV_TRANSPOSE_2D",
      "POOL_1D",
      "POOL_2D",
+    "POOL_2D_BACK",
      "UPSCALE",
      "PAD",
      "ARANGE",
@@ -2837,7 +2839,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
      "CROSS_ENTROPY_LOSS_BACK",
  };
  
-static_assert(GGML_OP_COUNT == 76, "GGML_OP_COUNT != 76");
+static_assert(GGML_OP_COUNT == 78, "GGML_OP_COUNT != 78");
  
  static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
      "none",
@@ -2891,9 +2893,11 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
      "clamp(x)",
      "conv_transpose_1d(x)",
      "im2col(x)",
+    "im2col_back(x)",
      "conv_transpose_2d(x)",
      "pool_1d(x)",
      "pool_2d(x)",
+    "pool_2d_back(x)",
      "upscale(x)",
      "pad(x)",
      "arange(start, stop, step)",
@@ -2927,7 +2931,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
      "cross_entropy_loss_back(x,y)",
  };
  
-static_assert(GGML_OP_COUNT == 76, "GGML_OP_COUNT != 76");
+static_assert(GGML_OP_COUNT == 78, "GGML_OP_COUNT != 78");
  
  static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
  
@@ -3741,6 +3745,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
  
      size_t data_size = ggml_row_size(type, ne[0]);
      for (int i = 1; i < n_dims; i++) {
+        assert(ne[i] > 0);
          data_size *= ne[i];
      }
  
@@ -3773,6 +3778,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
      }
  
      struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TYPE_TENSOR, GGML_TENSOR_SIZE + obj_alloc_size);
+    GGML_ASSERT(obj_new);
  
      // TODO: for recoverable errors, we would need to free the data allocated from the scratch buffer here
  
@@ -4492,8 +4498,6 @@ static struct ggml_tensor * ggml_add_impl(
      bool is_node = false;
  
      if (!inplace && (a->grad || b->grad)) {
-        // TODO: support backward pass for broadcasting
-        GGML_ASSERT(ggml_are_same_shape(a, b));
          is_node = true;
      }
  
@@ -6801,17 +6805,20 @@ struct ggml_tensor * ggml_im2col(
          GGML_ASSERT(a->ne[2] == b->ne[2]);
      } else {
          GGML_ASSERT(a->ne[1] == b->ne[1]);
+        GGML_ASSERT(b->ne[3] == 1);
      }
      bool is_node = false;
  
-    if (a->grad || b->grad) {
-        GGML_ABORT("fatal error"); // TODO: implement backward
+    if (/*a->grad ||*/ b->grad) { // a is only used for its shape, not its data
          is_node = true;
      }
  
      const int64_t OH = is_2D ? ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1) : 0;
      const int64_t OW =         ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
  
+    GGML_ASSERT((!is_2D || OH > 0) && "b too small compared to a");
+    GGML_ASSERT((OW > 0)           && "b too small compared to a");
+
      const int64_t ne[4] = {
          is_2D ? (a->ne[2] * a->ne[1] * a->ne[0]) : a->ne[1] * a->ne[0],
          OW,
@@ -6831,6 +6838,37 @@ struct ggml_tensor * ggml_im2col(
      return result;
  }
  
+struct ggml_tensor * ggml_im2col_back(
+    struct ggml_context * ctx,
+    struct ggml_tensor  * a,  // convolution kernel
+    struct ggml_tensor  * b,  // gradient of im2col output
+    int64_t             * ne, // shape of im2col input, used as the shape of the result
+    int                   s0, // stride dimension 0
+    int                   s1, // stride dimension 1
+    int                   p0, // padding dimension 0
+    int                   p1, // padding dimension 1
+    int                   d0, // dilation dimension 0
+    int                   d1, // dilation dimension 1
+    bool                  is_2D) {
+    // Builds an F32 GGML_OP_IM2COL_BACK node with src[0] = a and src[1] = b; nothing is computed here.
+    bool is_node = false;
+
+    if (/*a->grad ||*/ b->grad) { // a is only used for its shape, not its data
+        is_node = true;
+    }
+
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+    int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) }; // read back in this order by ggml_compute_forward_im2col_back_f32
+    ggml_set_op_params(result, params, sizeof(params));
+
+    result->op = GGML_OP_IM2COL_BACK;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+    result->src[1] = b;
+
+    return result;
+}
+
  // a: [OC, IC, KH, KW]
  // b: [N, IC, IH, IW]
  // result: [N, OC, OH, OW]
@@ -6844,7 +6882,7 @@ struct ggml_tensor * ggml_conv_2d(
          int                  p1,
          int                  d0,
          int                  d1) {
-    struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true, GGML_TYPE_F16); // [N, OH, OW, IC * KH * KW]
+    struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true, a->type); // [N, OH, OW, IC * KH * KW]
  
      struct ggml_tensor * result =
          ggml_mul_mat(ctx,
@@ -6970,17 +7008,17 @@ struct ggml_tensor * ggml_pool_2d(
      bool is_node = false;
  
      if (a->grad) {
-        GGML_ABORT("fatal error"); // TODO: implement backward
          is_node = true;
      }
  
      struct ggml_tensor * result;
-    const int64_t ne[3] = {
+    const int64_t ne[4] = {
          ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
          ggml_calc_pool_output_size(a->ne[1], k1, s1, p1),
          a->ne[2],
+        a->ne[3],
      };
-    result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
+    result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
  
      int32_t params[] = { op, k0, k1, s0, s1, p0, p1 };
      ggml_set_op_params(result, params, sizeof(params));
@@ -6991,6 +7029,37 @@ struct ggml_tensor * ggml_pool_2d(
      return result;
  }
  
+struct ggml_tensor * ggml_pool_2d_back(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,  // gradient of the ggml_pool_2d output (as passed by ggml_compute_backward)
+        struct ggml_tensor  * af, // input of the forward ggml_pool_2d call; the result takes its shape
+        enum ggml_op_pool     op,
+        int                   k0,
+        int                   k1,
+        int                   s0,
+        int                   s1,
+        float                 p0,
+        float                 p1) {
+    // Builds a GGML_OP_POOL_2D_BACK node with src[0] = a and src[1] = af; nothing is computed here.
+    bool is_node = false;
+
+    if (a->grad) {
+        is_node = true;
+    }
+
+    struct ggml_tensor * result;
+    result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, af->ne); // always F32, regardless of the type of af
+
+    int32_t params[] = { op, k0, k1, s0, s1, p0, p1 }; // float p0/p1 are implicitly converted to int32_t here
+    ggml_set_op_params(result, params, sizeof(params));
+
+    result->op = GGML_OP_POOL_2D_BACK;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+    result->src[1] = af;
+    return result;
+}
+
  // ggml_upscale
  
  static struct ggml_tensor * ggml_upscale_impl(
@@ -14714,6 +14783,7 @@ static void ggml_compute_forward_conv_transpose_1d(
      }
  }
  
+// ggml_compute_forward_im2col_f32
  // src0: kernel [OC, IC, KH, KW]
  // src1: image [N, IC, IH, IW]
  // dst:  result [N, OH, OW, IC*KH*KW]
@@ -14724,7 +14794,6 @@ static void ggml_compute_forward_im2col_f32(
      const struct ggml_tensor * src0 = dst->src[0];
      const struct ggml_tensor * src1 = dst->src[1];
  
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
      GGML_ASSERT(src1->type == GGML_TYPE_F32);
      GGML_ASSERT( dst->type == GGML_TYPE_F32);
  
@@ -14755,7 +14824,6 @@ static void ggml_compute_forward_im2col_f32(
      int ofs0 = is_2D ? nb13 : nb12;
      int ofs1 = is_2D ? nb12 : nb11;
  
-    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
      GGML_ASSERT(nb10 == sizeof(float));
  
      // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
@@ -14791,6 +14859,7 @@ static void ggml_compute_forward_im2col_f32(
  }
  
  
+// ggml_compute_forward_im2col_f16
  // src0: kernel [OC, IC, KH, KW]
  // src1: image [N, IC, IH, IW]
  // dst:  result [N, OH, OW, IC*KH*KW]
@@ -14886,6 +14955,99 @@ static void ggml_compute_forward_im2col(
      }
  }
  
+// ggml_compute_forward_im2col_back_f32
+// src0: kernel (only its shape is read), src1: gradient of the im2col output, dst: gradient of the im2col input
+static void ggml_compute_forward_im2col_back_f32(
+        const struct ggml_compute_params * params,
+              struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    GGML_TENSOR_BINARY_OP_LOCALS;
+
+    const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
+    const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
+    const int32_t p0 = ((const int32_t *)(dst->op_params))[2];
+    const int32_t p1 = ((const int32_t *)(dst->op_params))[3];
+    const int32_t d0 = ((const int32_t *)(dst->op_params))[4];
+    const int32_t d1 = ((const int32_t *)(dst->op_params))[5];
+    const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1;
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int64_t N  = is_2D ? ne3 : ne2;
+    const int64_t IC = is_2D ? ne2 : ne1;
+    const int64_t IH = is_2D ? ne1 : 1;
+    const int64_t IW = ne0;
+
+    const int64_t KH = is_2D ? ne01 : 1;
+    const int64_t KW = ne00;
+
+    const int64_t OH = is_2D ? ne12 : 1;
+    const int64_t OW = ne11;
+
+    // byte strides of dst; kept 64-bit so that strides of tensors >= 2 GiB are not truncated
+    const int64_t ofs0 = is_2D ? nb3 : nb2; // stride between batch entries
+    const int64_t ofs1 = is_2D ? nb2 : nb1; // stride between channels
+
+    GGML_ASSERT(nb0  == sizeof(float));
+
+    // im2col_back: [N, OH, OW, IC*KH*KW] => [N, IC, IH, IW]
+    {
+        float * const wdata = (float *) dst->data;
+
+        for (int64_t in = 0; in < N; in++) {
+            for (int64_t iic = ith; iic < IC; iic += nth) { // channels are interleaved across threads
+                for (int64_t iih = 0; iih < IH; iih++) {
+                    for (int64_t iiw = 0; iiw < IW; iiw++) {
+
+                        // micro kernel
+                        float grad = 0.0f;
+                        for (int64_t ikh = 0; ikh < KH; ikh++) {
+                            for (int64_t ikw = 0; ikw < KW; ikw++) {
+                                // For s0 > 1 some values were skipped over in the forward pass.
+                                // These values have tmpw % s0 != 0 and need to be skipped in the backwards pass as well.
+                                const int64_t tmpw = (iiw + p0 - ikw*d0);
+                                if (tmpw % s0 != 0) {
+                                    continue;
+                                }
+                                const int64_t iow = tmpw / s0;
+
+                                // Equivalent logic as above except for s1.
+                                int64_t ioh;
+                                if (is_2D) {
+                                    const int64_t tmph = iih + p1 - ikh*d1;
+
+                                    if (tmph % s1 != 0) {
+                                        continue;
+                                    }
+
+                                    ioh = tmph / s1;
+                                } else {
+                                    ioh = 0;
+                                }
+
+                                if (iow < 0 || iow >= OW || ioh < 0 || ioh >= OH) {
+                                    continue;
+                                }
+                                // NOTE(review): src1 is indexed as a densely packed float array; contiguity is not asserted here
+                                const float * const src_data = (const float *) src1->data
+                                    + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW]
+                                grad += src_data[iic*(KH*KW) + ikh*KW + ikw];
+                            }
+                        }
+                        float * dst_data = (float *)((char *) wdata + (in*ofs0 + iic*ofs1)); // [IH, IW]
+                        dst_data[iih*IW + iiw] = grad;
+                    }
+                }
+            }
+        }
+    }
+}
  
  // ggml_compute_forward_conv_transpose_2d
  
@@ -15128,6 +15290,128 @@ static void ggml_compute_forward_pool_2d(
      }
  }
  
+// ggml_compute_forward_pool_2d_back
+// src[0]: gradient of the pool_2d output, src[1]: input of the forward pass, dst: gradient w.r.t. that input
+static void ggml_compute_forward_pool_2d_back(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src  = dst->src[0];
+    const struct ggml_tensor * dstf = dst->src[1]; // forward tensor of dst
+
+    assert(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+    assert(dstf->type == GGML_TYPE_F32 || dstf->type == GGML_TYPE_F16);
+    if (params->ith != 0) {
+        return; // the whole op is done by thread 0
+    }
+
+    const int32_t * opts = (const int32_t *)dst->op_params;
+    enum ggml_op_pool op = opts[0];
+    const int k0 = opts[1];
+    const int k1 = opts[2];
+    const int s0 = opts[3];
+    const int s1 = opts[4];
+    const int p0 = opts[5];
+    const int p1 = opts[6];
+
+    char       * cdata  = (char       *) dst->data;
+    const char * cdataf = (const char *) dstf->data;
+    const char * const data_end = cdata + ggml_nbytes(dst);
+
+    GGML_ASSERT(params->ith == 0);
+    memset(cdata, 0, ggml_nbytes(dst)); // gradients are accumulated into dst below, so start from zero
+
+    const int64_t px = src->ne[0];
+    const int64_t py = src->ne[1];
+    const int64_t pa = px * py;
+
+    const float * splane = (const float *) src->data; // NOTE(review): src is read as packed F32 planes; not asserted here
+
+    const int ka = k0 * k1;
+    const int offset0 = -p0;
+    const int offset1 = -p1;
+
+    while (cdata < data_end) { // one iteration per 2D plane of dst
+        for (int oy = 0; oy < py; ++oy) {
+            const float * const srow = splane + oy * px;
+            for (int ox = 0; ox < px; ++ox) {
+                const float grad0 = srow[ox];
+
+                const int ix = offset0 + ox * s0;
+                const int iy = offset1 + oy * s1;
+
+                if (op == GGML_OP_POOL_MAX) {
+                    float maxval = -FLT_MAX;
+                    int kxmax = -1;
+                    int kymax = -1;
+                    // find the in-bounds window position that held the maximum in the forward input
+                    for (int ky = 0; ky < k1; ++ky) {
+                        if (iy + ky < 0 || iy + ky >= dst->ne[1]) {
+                            continue;
+                        }
+                        const void * drowf = (const void *)(cdataf + dstf->nb[1] * (iy + ky)); // forward input has its own strides
+                        for (int kx = 0; kx < k0; ++kx) {
+                            int j = ix + kx;
+                            if (j < 0 || j >= dst->ne[0]) {
+                                continue;
+                            }
+
+                            const float val = dstf->type == GGML_TYPE_F32 ?
+                                ((const float *) drowf)[j] : GGML_FP16_TO_FP32(((const ggml_fp16_t *) drowf)[j]);
+                            if (val <= maxval) {
+                                continue;
+                            }
+
+                            maxval = val;
+                            kxmax = kx;
+                            kymax = ky;
+                        }
+                    }
+
+                    if (kxmax == -1 || kymax == -1) {
+                        continue;
+                    }
+
+                    void * drow = (void *)(cdata + dst->nb[1] * (iy + kymax));
+                    const int j = ix + kxmax;
+                    if (dst->type == GGML_TYPE_F32) {
+                        ((float *) drow)[j] += grad0;
+                    } else {
+                        ((ggml_fp16_t *) drow)[j] = GGML_FP32_TO_FP16(grad0 + GGML_FP16_TO_FP32(((const ggml_fp16_t *) drow)[j]));
+                    }
+                } else if (op == GGML_OP_POOL_AVG) {
+                    const float grad = grad0 / ka;
+
+                    for (int ky = 0; ky < k1; ++ky) {
+                        if (iy + ky < 0 || iy + ky >= dst->ne[1]) {
+                            continue;
+                        }
+                        void * drow = (void *)(cdata + dst->nb[1] * (iy + ky));
+                        for (int kx = 0; kx < k0; ++kx) {
+                            int j = ix + kx;
+                            if (j < 0 || j >= dst->ne[0]) {
+                                continue;
+                            }
+                            // F16: convert, add in fp32 and convert back (as in the MAX branch); += on the raw storage is not a float add
+                            if (dst->type == GGML_TYPE_F32) {
+                                ((float *) drow)[j] += grad;
+                            } else {
+                                ((ggml_fp16_t *) drow)[j] = GGML_FP32_TO_FP16(grad + GGML_FP16_TO_FP32(((const ggml_fp16_t *) drow)[j]));
+                            }
+                        }
+                    }
+                } else {
+                    GGML_ASSERT(false);
+                }
+            }
+        }
+
+        cdata  += dst->nb[2];
+        cdataf += dstf->nb[2];
+        splane += pa;
+    }
+}
+
  // ggml_compute_forward_upscale
  
  static void ggml_compute_forward_upscale_f32(
@@ -17097,6 +17381,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
              {
                  ggml_compute_forward_im2col(params, tensor);
              } break;
+        case GGML_OP_IM2COL_BACK:
+            {
+                ggml_compute_forward_im2col_back_f32(params, tensor);
+            } break;
          case GGML_OP_CONV_TRANSPOSE_2D:
              {
                  ggml_compute_forward_conv_transpose_2d(params, tensor);
@@ -17109,6 +17397,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
              {
                  ggml_compute_forward_pool_2d(params, tensor);
              } break;
+        case GGML_OP_POOL_2D_BACK:
+            {
+                ggml_compute_forward_pool_2d_back(params, tensor);
+            } break;
          case GGML_OP_UPSCALE:
              {
                  ggml_compute_forward_upscale(params, tensor);
@@ -17477,7 +17769,11 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                      src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table);
                  }
                  if (src1->grad) {
-                    src1->grad = ggml_add_or_set(ctx, src1->grad, tensor->grad, zero_table);
+                    if (ggml_are_same_shape(src0, src1)) {
+                        src1->grad = ggml_add_or_set(ctx, src1->grad,                       tensor->grad,        zero_table);
+                    } else {
+                        src1->grad = ggml_add_or_set(ctx, src1->grad, ggml_repeat_back(ctx, tensor->grad, src1), zero_table);
+                    }
                  }
              } break;
          case GGML_OP_ADD1:
@@ -18074,6 +18370,23 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                  GGML_ABORT("fatal error"); // TODO: not implemented
              }
          case GGML_OP_IM2COL:
+            {
+                if (src1->grad) {
+                    const int32_t s0    = ggml_get_op_params_i32(tensor, 0);
+                    const int32_t s1    = ggml_get_op_params_i32(tensor, 1);
+                    const int32_t p0    = ggml_get_op_params_i32(tensor, 2);
+                    const int32_t p1    = ggml_get_op_params_i32(tensor, 3);
+                    const int32_t d0    = ggml_get_op_params_i32(tensor, 4);
+                    const int32_t d1    = ggml_get_op_params_i32(tensor, 5);
+                    const bool    is_2D = ggml_get_op_params_i32(tensor, 6) == 1;
+
+                    src1->grad = ggml_add_or_set(ctx,
+                            src1->grad,
+                            ggml_im2col_back(ctx, src0, tensor->grad, src1->ne, s0, s1, p0, p1, d0, d1, is_2D),
+                            zero_table);
+                }
+            } break;
+        case GGML_OP_IM2COL_BACK:
              {
                  GGML_ABORT("fatal error"); // TODO: not implemented
              }
@@ -18086,6 +18399,23 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                  GGML_ABORT("fatal error"); // TODO: not implemented
              }
          case GGML_OP_POOL_2D:
+            {
+                if (src0->grad) {
+                    const enum ggml_op_pool op = ggml_get_op_params_i32(tensor, 0);
+                    const      int32_t      k0 = ggml_get_op_params_i32(tensor, 1);
+                    const      int32_t      k1 = ggml_get_op_params_i32(tensor, 2);
+                    const      int32_t      s0 = ggml_get_op_params_i32(tensor, 3);
+                    const      int32_t      s1 = ggml_get_op_params_i32(tensor, 4);
+                    const      int32_t      p0 = ggml_get_op_params_i32(tensor, 5);
+                    const      int32_t      p1 = ggml_get_op_params_i32(tensor, 6);
+
+                    src0->grad = ggml_add_or_set(ctx,
+                            src0->grad,
+                            ggml_pool_2d_back(ctx, tensor->grad, src0, op, k0, k1, s0, s1, p0, p1),
+                            zero_table);
+                }
+            } break;
+        case GGML_OP_POOL_2D_BACK:
              {
                  GGML_ABORT("fatal error"); // TODO: not implemented
              }
@@ -18375,6 +18705,7 @@ void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor *
  
  void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep) {
      GGML_ASSERT(gf->n_nodes > 0);
+    GGML_ASSERT(gf->grads);
  
      // if we are keeping the gradient graph, we have to detach the gradient nodes from the original graph
      if (keep) {
@@ -18802,6 +19133,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
                  n_tasks = MIN(n_threads, ggml_nrows(node->src[0]));
              } break;
          case GGML_OP_IM2COL:
+        case GGML_OP_IM2COL_BACK:
          case GGML_OP_CONV_TRANSPOSE_1D:
          case GGML_OP_CONV_TRANSPOSE_2D:
              {
@@ -18809,6 +19141,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
              } break;
          case GGML_OP_POOL_1D:
          case GGML_OP_POOL_2D:
+        case GGML_OP_POOL_2D_BACK:
              {
                  n_tasks = 1;
              } break;
@@ -19322,9 +19655,11 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
  
                  const uint32_t type   = tensor->type;
                  const uint32_t op     = tensor->op;
+                const int32_t  flags  = tensor->flags;
  
                  fwrite(&type,   sizeof(uint32_t), 1, fout);
                  fwrite(&op,     sizeof(uint32_t), 1, fout);
+                fwrite(&flags,  sizeof(int32_t),  1, fout);
  
                  for (int j = 0; j < GGML_MAX_DIMS; ++j) {
                      const uint64_t ne = tensor->ne[j];
@@ -19354,9 +19689,11 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
  
                  const uint32_t type   = tensor->type;
                  const uint32_t op     = tensor->op;
+                const int32_t  flags  = tensor->flags;
  
                  fwrite(&type,   sizeof(uint32_t), 1, fout);
                  fwrite(&op,     sizeof(uint32_t), 1, fout);
+                fwrite(&flags,  sizeof(int32_t),  1, fout);
  
                  for (int j = 0; j < GGML_MAX_DIMS; ++j) {
                      const uint64_t ne = tensor->ne[j];
@@ -19415,6 +19752,14 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
                          }
                      }
                  }
+
+                // dump the data
+                // TODO: pad this to 32 byte boundary
+                if ((flags & GGML_TENSOR_FLAG_PARAM)) {
+                    const size_t size = ggml_nbytes(tensor);
+
+                    fwrite(tensor->data, sizeof(char), size, fout);
+                }
              }
          }
  
@@ -19528,10 +19873,12 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
          {
              uint32_t type;
              uint32_t op;
+            int32_t  flags;
  
              for (uint32_t i = 0; i < n_leafs; ++i) {
                  type   = *(const uint32_t *) ptr; ptr += sizeof(type);
                  op     = *(const uint32_t *) ptr; ptr += sizeof(op);
+                flags  = *(const int32_t  *) ptr; ptr += sizeof(flags);
  
                  int64_t ne[GGML_MAX_DIMS];
                  size_t  nb[GGML_MAX_DIMS];
@@ -19549,20 +19896,19 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
  
                  struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, GGML_MAX_DIMS, ne);
  
-                tensor->op = (enum ggml_op) op;
+                tensor->op    = (enum ggml_op) op;
+                tensor->flags = flags;
  
                  memcpy(tensor->name,      ptr, GGML_MAX_NAME);      ptr += GGML_MAX_NAME;
                  memcpy(tensor->op_params, ptr, GGML_MAX_OP_PARAMS); ptr += GGML_MAX_OP_PARAMS;
  
-                tensor->data = (void *) ptr;
-
                  for (int j = 0; j < GGML_MAX_DIMS; ++j) {
                      tensor->nb[j] = nb[j];
                  }
  
-                result->leafs[i] = tensor;
+                tensor->data = (void *) ptr; ptr += ggml_nbytes(tensor);
  
-                ptr += ggml_nbytes(tensor);
+                result->leafs[i] = tensor;
  
                  fprintf(stderr, "%s: loaded leaf %u: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
              }
@@ -19574,10 +19920,12 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
          {
              uint32_t type;
              uint32_t op;
+            int32_t  flags;
  
              for (uint32_t i = 0; i < n_nodes; ++i) {
                  type   = *(const uint32_t *) ptr; ptr += sizeof(type);
                  op     = *(const uint32_t *) ptr; ptr += sizeof(op);
+                flags  = *(const int32_t  *) ptr; ptr += sizeof(flags);
  
                  enum ggml_op eop = (enum ggml_op) op;
  
@@ -19667,6 +20015,11 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
  
                  result->nodes[i] = tensor;
  
+                // TODO: tensor data is duplicated due to the ggml_new_tensor call above
+                if (flags & GGML_TENSOR_FLAG_PARAM) {
+                    tensor->data = (void *) ptr; ptr += ggml_nbytes(tensor);
+                }
+
                  fprintf(stderr, "%s: loaded node %u: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
              }
          }
@@ -20701,6 +21054,8 @@ enum ggml_opt_result ggml_opt(
          struct ggml_context * ctx,
          struct ggml_opt_params params,
          struct ggml_tensor * f) {
+    GGML_ASSERT(f->grad && "ggml_set_param called for at least one parent tensor.");
+
      bool free_ctx = false;
      if (ctx == NULL) {
          struct ggml_init_params params_ctx = {
@@ -20755,6 +21110,8 @@ enum ggml_opt_result ggml_opt_resume_g(
          ggml_opt_callback callback,
          void * callback_data) {
  
+    GGML_ASSERT(f->grad && "ggml_set_param must be called for at least one ancestor");
+
      // build forward + backward compute graphs
      enum ggml_opt_result result = GGML_OPT_RESULT_OK;
  
@@ -21842,6 +22199,7 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
  void gguf_add_tensor(
               struct gguf_context * ctx,
          const struct ggml_tensor * tensor) {
+    GGML_ASSERT(tensor);
      if (gguf_find_tensor(ctx, tensor->name) != -1) {
          GGML_ABORT("duplicated tensor name");
      }
author	Johannes Gäßler <redacted>
	Tue, 30 Jul 2024 13:56:35 +0000 (15:56 +0200)
committer	Georgi Gerganov <redacted>
	Wed, 28 Aug 2024 10:22:20 +0000 (13:22 +0300)
ggml/include/ggml.h		patch \| blob \| history
ggml/src/ggml.c		patch \| blob \| history