"CLAMP",
"CONV_1D",
"CONV_2D",
+ "POOL_1D",
+ "POOL_2D",
"FLASH_ATTN",
"FLASH_FF",
"CROSS_ENTROPY_LOSS_BACK",
};
-static_assert(GGML_OP_COUNT == 66, "GGML_OP_COUNT != 66");
+static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none",
"clamp(x)",
"conv_1d(x)",
"conv_2d(x)",
+ "pool_1d(x)",
+ "pool_2d(x)",
"flash_attn(x)",
"flash_ff(x)",
"cross_entropy_loss_back(x,y)",
};
-static_assert(GGML_OP_COUNT == 66, "GGML_OP_COUNT != 66");
+static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
return ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d);
}
+
+// ggml_pool_*
+
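+// standard pooling output size: floor((n + 2*p - k) / s) + 1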
+static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, int p) {
+ return (ins + 2 * p - ks) / s + 1;
+}
+
+// ggml_pool_1d
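+// 1D pooling along the first dimension of a, one row at a time.
+// op selects average or max pooling; k0, s0 and p0 are the kernel size,
+// stride and padding (the forward pass below requires p0 == 0 and s0 == k0).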
+
+struct ggml_tensor* ggml_pool_1d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ enum ggml_pool_op op,
+ int k0,
+ int s0,
+ int p0) {
+
+ bool is_node = false;
+
+ if (a->grad) {
+ GGML_ASSERT(false); // TODO: implement backward
+ is_node = true;
+ }
+
+ const int64_t ne[2] = {
+ ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
+ a->ne[1],
+ };
+ struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
+
+ ggml_scratch_save(ctx);
+ struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
+ ((int32_t*)c->data)[0] = op;
+ ((int32_t*)c->data)[1] = k0;
+ ((int32_t*)c->data)[2] = s0;
+ ((int32_t*)c->data)[3] = p0;
+ ggml_scratch_load(ctx);
+
+ result->op = GGML_OP_POOL_1D;
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+ result->src[0] = a;
+ result->src[1] = c;
+
+ return result;
+}
+
+// ggml_pool_2d
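+// 2D pooling over the first two dimensions of a, one plane at a time.
+// k0/k1, s0/s1 and p0/p1 are the per-axis kernel sizes, strides and paddings
+// (the forward pass below requires zero padding and s == k on both axes).
+// e.g. GGML_POOL_AVG with k0 = s0 = 3 and k1 = s1 = 4 reduces a [10, 10, 2]
+// tensor to [3, 2, 2] (see the new test below).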
+
+struct ggml_tensor* ggml_pool_2d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ enum ggml_pool_op op,
+ int k0,
+ int k1,
+ int s0,
+ int s1,
+ int p0,
+ int p1) {
+
+ bool is_node = false;
+
+ if (a->grad) {
+ GGML_ASSERT(false); // TODO: implement backward
+ is_node = true;
+ }
+
+ const int64_t ne[3] = {
+ ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
+ ggml_calc_pool_output_size(a->ne[1], k1, s1, p1),
+ a->ne[2],
+ };
+ struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
+
+ ggml_scratch_save(ctx);
+ struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 7);
+ ((int32_t*)c->data)[0] = op;
+ ((int32_t*)c->data)[1] = k0;
+ ((int32_t*)c->data)[2] = k1;
+ ((int32_t*)c->data)[3] = s0;
+ ((int32_t*)c->data)[4] = s1;
+ ((int32_t*)c->data)[5] = p0;
+ ((int32_t*)c->data)[6] = p1;
+ ggml_scratch_load(ctx);
+
+ result->op = GGML_OP_POOL_2D;
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+ result->src[0] = a;
+ result->src[1] = c;
+
+ return result;
+}
+
// ggml_flash_attn
struct ggml_tensor * ggml_flash_attn(
};
}
+// ggml_compute_forward_pool_1d_sk_p0
+
+static void ggml_compute_forward_pool_1d_sk_p0(
+ const struct ggml_compute_params * params,
+ const enum ggml_pool_op op,
+ const struct ggml_tensor * src,
+ const int k,
+ struct ggml_tensor * dst) {
+ assert(src->type == GGML_TYPE_F32);
+ assert(params->ith == 0);
+
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+ return;
+ }
+
+ const char* cdata = (const char*)src->data;
+ const char* const data_end = cdata + ggml_nbytes(src);
+ float* drow = (float*)dst->data;
+ const int64_t rs = dst->ne[0];
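+ // process src one row at a time (row stride nb[1]); since s == k the
+ // pooling windows tile the row, so the read index j advances monotonically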
+ while (cdata < data_end) {
+ const float* const srow = (const float*)cdata;
+
+ int j = 0;
+ static_assert(GGML_NUM_POOL_OPS == 2, "GGML_NUM_POOL_OPS != 2");
+ for (int64_t i = 0; i < rs; ++i) {
+ switch (op) {
+ case GGML_POOL_AVG:
+ drow[i] = 0;
+ break;
+ case GGML_POOL_MAX:
+ drow[i] = -FLT_MAX;
+ break;
+ }
+ for (int ki = 0; ki < k; ++ki) {
+ switch (op) {
+ case GGML_POOL_AVG:
+ drow[i] += srow[j];
+ break;
+ case GGML_POOL_MAX:
+ if (srow[j] > drow[i]) drow[i] = srow[j];
+ break;
+ }
+ ++j;
+ }
+ switch (op) {
+ case GGML_POOL_AVG:
+ drow[i] /= k;
+ break;
+ }
+ }
+
+ cdata += src->nb[1];
+ drow += rs;
+ }
+}
+
+// ggml_compute_forward_pool_1d
+
+static void ggml_compute_forward_pool_1d(
+ const struct ggml_compute_params* params,
+ const struct ggml_tensor* src0,
+ const struct ggml_tensor* opt0,
+ struct ggml_tensor* dst) {
+ GGML_ASSERT(opt0->ne[0] == 4);
+ const int* opts = (const int*)opt0->data;
+ enum ggml_pool_op op = opts[0];
+ const int k0 = opts[1];
+ const int s0 = opts[2];
+ const int p0 = opts[3];
+ GGML_ASSERT(p0 == 0); // padding not supported
+ GGML_ASSERT(k0 == s0); // only s = k supported
+
+ ggml_compute_forward_pool_1d_sk_p0(params, op, src0, k0, dst);
+}
+
+// ggml_compute_forward_pool_2d_sk_p0
+
+static void ggml_compute_forward_pool_2d_sk_p0(
+ const struct ggml_compute_params * params,
+ const enum ggml_pool_op op,
+ const struct ggml_tensor * src,
+ const int k0,
+ const int k1,
+ struct ggml_tensor * dst) {
+ assert(src->type == GGML_TYPE_F32);
+ assert(params->ith == 0);
+
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+ return;
+ }
+
+ const char* cdata = (const char*)src->data;
+ const char* const data_end = cdata + ggml_nbytes(src);
+
+ const int64_t px = dst->ne[0];
+ const int64_t py = dst->ne[1];
+ const int64_t pa = px * py;
+ float* dplane = (float*)dst->data;
+
+ const int ka = k0 * k1;
+
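+ // process src one 2D plane at a time (plane stride nb[2]); output cell
+ // (ox, oy) pools the k0 x k1 input window anchored at (ox*k0, oy*k1)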
+ while (cdata < data_end) {
+ static_assert(GGML_NUM_POOL_OPS == 2, "GGML_NUM_POOL_OPS != 2");
+ for (int oy = 0; oy < py; ++oy) {
+ float * const drow = dplane + oy * px;
+ for (int ox = 0; ox < px; ++ox) {
+ float * const out = drow + ox;
+ switch (op) {
+ case GGML_POOL_AVG:
+ *out = 0;
+ break;
+ case GGML_POOL_MAX:
+ *out = -FLT_MAX;
+ break;
+ }
+ const int ix = ox * k0;
+ const int iy = oy * k1;
+ for (int ky = 0; ky < k1; ++ky) {
+ const float* const srow = (const float*)(cdata + src->nb[1] * (iy + ky));
+ for (int kx = 0; kx < k0; ++kx) {
+ int j = ix + kx;
+ switch (op) {
+ case GGML_POOL_AVG:
+ *out += srow[j];
+ break;
+ case GGML_POOL_MAX:
+ if (srow[j] > *out) *out = srow[j];
+ break;
+ }
+ }
+ }
+ switch (op) {
+ case GGML_POOL_AVG:
+ *out /= ka;
+ break;
+ }
+ }
+ }
+
+ cdata += src->nb[2];
+ dplane += pa;
+ }
+}
+
+// ggml_compute_forward_pool_2d
+
+static void ggml_compute_forward_pool_2d(
+ const struct ggml_compute_params * params,
+ const struct ggml_tensor * src0,
+ const struct ggml_tensor * opt0,
+ struct ggml_tensor * dst) {
+ GGML_ASSERT(opt0->ne[0] == 7);
+ const int* opts = (const int*)opt0->data;
+ enum ggml_pool_op op = opts[0];
+ const int k0 = opts[1];
+ const int k1 = opts[2];
+ const int s0 = opts[3];
+ const int s1 = opts[4];
+ const int p0 = opts[5];
+ const int p1 = opts[6];
+ GGML_ASSERT(p0 == 0);
+ GGML_ASSERT(p1 == 0); // padding not supported
+ GGML_ASSERT(k0 == s0);
+ GGML_ASSERT(k1 == s1); // only s = k supported
+
+ ggml_compute_forward_pool_2d_sk_p0(params, op, src0, k0, k1, dst);
+}
+
// ggml_compute_forward_flash_attn
{
ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
} break;
+ case GGML_OP_POOL_1D:
+ {
+ ggml_compute_forward_pool_1d(params, tensor->src[0], tensor->src[1], tensor);
+ } break;
+ case GGML_OP_POOL_2D:
+ {
+ ggml_compute_forward_pool_2d(params, tensor->src[0], tensor->src[1], tensor);
+ } break;
case GGML_OP_FLASH_ATTN:
{
const int32_t t = ggml_get_i32_1d(tensor->src[3], 0);
{
GGML_ASSERT(false); // TODO: not implemented
} break;
+ case GGML_OP_POOL_1D:
+ {
+ GGML_ASSERT(false); // TODO: not implemented
+ } break;
+ case GGML_OP_POOL_2D:
+ {
+ GGML_ASSERT(false); // TODO: not implemented
+ } break;
case GGML_OP_FLASH_ATTN:
{
struct ggml_tensor * flash_grad = NULL;
work_size = MAX(work_size, cur);
} break;
+ case GGML_OP_POOL_1D:
+ case GGML_OP_POOL_2D:
+ {
+ n_tasks = 1;
+ } break;
case GGML_OP_FLASH_ATTN:
{
n_tasks = n_threads;
--- /dev/null
+#include "ggml/ggml.h"
+
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+struct ggml_context* make_ctx(void) {
+ struct ggml_init_params params = {
+ .mem_size = 2 * 1024 * 1024,
+ };
+
+ return ggml_init(params);
+}
+
+int main(int argc, const char** argv) {
+
+ float buf_f32[1024];
+ for (int i = 0; i < 1024; ++i) {
+ buf_f32[i] = (float)(i + 1);
+ }
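+
+ // buf_f32 holds the values 1, 2, ..., 1024; each test copies a prefix of it
+ // into a tensor, so rows/planes are filled in row-major order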
+
+ // avg pool 1d
+ {
+ struct ggml_context* ctx = make_ctx();
+ struct ggml_tensor* t = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 10, 2);
+ memcpy(t->data, buf_f32, ggml_nbytes(t));
+
+ struct ggml_tensor* t_pooled = ggml_pool_1d(ctx, t, GGML_POOL_AVG, 3, 3, 0);
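+ // rows of t are 1..10 and 11..20; with k = s = 3 each row yields three
+ // windows (the 10th element is dropped), e.g. avg(1, 2, 3) = 2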
+ GGML_ASSERT(t_pooled->ne[0] == 3);
+ GGML_ASSERT(t_pooled->ne[1] == 2);
+ GGML_ASSERT(t_pooled->ne[2] == 1);
+
+ struct ggml_cgraph graph = ggml_build_forward(t_pooled);
+
+ ggml_graph_compute_with_ctx(ctx, &graph, 4);
+
+ const float* output = ggml_get_data_f32(t_pooled);
+ GGML_ASSERT(output[0] == 2);
+ GGML_ASSERT(output[1] == 5);
+ GGML_ASSERT(output[2] == 8);
+ GGML_ASSERT(output[3] == 12);
+ GGML_ASSERT(output[4] == 15);
+ GGML_ASSERT(output[5] == 18);
+
+ ggml_free(ctx);
+ }
+
+ // max pool 1d
+ {
+ struct ggml_context* ctx = make_ctx();
+ struct ggml_tensor* t = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 10, 2);
+ memcpy(t->data, buf_f32, ggml_nbytes(t));
+
+ struct ggml_tensor* t_pooled = ggml_pool_1d(ctx, t, GGML_POOL_MAX, 3, 3, 0);
+ GGML_ASSERT(t_pooled->ne[0] == 3);
+ GGML_ASSERT(t_pooled->ne[1] == 2);
+ GGML_ASSERT(t_pooled->ne[2] == 1);
+
+ struct ggml_cgraph graph = ggml_build_forward(t_pooled);
+
+ ggml_graph_compute_with_ctx(ctx, &graph, 4);
+
+ const float* output = ggml_get_data_f32(t_pooled);
+ GGML_ASSERT(output[0] == 3);
+ GGML_ASSERT(output[1] == 6);
+ GGML_ASSERT(output[2] == 9);
+ GGML_ASSERT(output[3] == 13);
+ GGML_ASSERT(output[4] == 16);
+ GGML_ASSERT(output[5] == 19);
+
+ ggml_free(ctx);
+ }
+
+ // avg pool 2d
+ {
+ struct ggml_context* ctx = make_ctx();
+ struct ggml_tensor* t = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 10, 10, 2);
+ memcpy(t->data, buf_f32, ggml_nbytes(t));
+
+ struct ggml_tensor* t_pooled = ggml_pool_2d(ctx, t, GGML_POOL_AVG, 3, 4, 3, 4, 0, 0);
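+ // the first plane of t holds 1..100 row-major; e.g. output[0] averages the
+ // 3x4 window {1..3, 11..13, 21..23, 31..33}, which is 204 / 12 = 17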
+ GGML_ASSERT(t_pooled->ne[0] == 3);
+ GGML_ASSERT(t_pooled->ne[1] == 2);
+ GGML_ASSERT(t_pooled->ne[2] == 2);
+ GGML_ASSERT(t_pooled->ne[3] == 1);
+
+ struct ggml_cgraph graph = ggml_build_forward(t_pooled);
+
+ ggml_graph_compute_with_ctx(ctx, &graph, 4);
+
+ const float* output = ggml_get_data_f32(t_pooled);
+ GGML_ASSERT(output[0] == 17);
+ GGML_ASSERT(output[1] == 20);
+ GGML_ASSERT(output[2] == 23);
+ GGML_ASSERT(output[3] == 57);
+ GGML_ASSERT(output[4] == 60);
+ GGML_ASSERT(output[5] == 63);
+ GGML_ASSERT(output[6] == 117);
+ GGML_ASSERT(output[7] == 120);
+ GGML_ASSERT(output[8] == 123);
+ GGML_ASSERT(output[9] == 157);
+ GGML_ASSERT(output[10] == 160);
+ GGML_ASSERT(output[11] == 163);
+
+ ggml_free(ctx);
+ }
+
+ // max pool 2d
+ {
+ struct ggml_context* ctx = make_ctx();
+ struct ggml_tensor* t = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 10, 10, 2);
+ memcpy(t->data, buf_f32, ggml_nbytes(t));
+
+ struct ggml_tensor* t_pooled = ggml_pool_2d(ctx, t, GGML_POOL_MAX, 3, 4, 3, 4, 0, 0);
+ GGML_ASSERT(t_pooled->ne[0] == 3);
+ GGML_ASSERT(t_pooled->ne[1] == 2);
+ GGML_ASSERT(t_pooled->ne[2] == 2);
+ GGML_ASSERT(t_pooled->ne[3] == 1);
+
+ struct ggml_cgraph graph = ggml_build_forward(t_pooled);
+
+ ggml_graph_compute_with_ctx(ctx, &graph, 4);
+
+ const float* output = ggml_get_data_f32(t_pooled);
+ GGML_ASSERT(output[0] == 33);
+ GGML_ASSERT(output[1] == 36);
+ GGML_ASSERT(output[2] == 39);
+ GGML_ASSERT(output[3] == 73);
+ GGML_ASSERT(output[4] == 76);
+ GGML_ASSERT(output[5] == 79);
+ GGML_ASSERT(output[6] == 133);
+ GGML_ASSERT(output[7] == 136);
+ GGML_ASSERT(output[8] == 139);
+ GGML_ASSERT(output[9] == 173);
+ GGML_ASSERT(output[10] == 176);
+ GGML_ASSERT(output[11] == 179);
+
+ ggml_free(ctx);
+ }
+
+ return 0;
+}