GGML_OP_POOL_2D_BACK,
GGML_OP_UPSCALE, // nearest interpolate
GGML_OP_PAD,
+ GGML_OP_PAD_REFLECT_1D,
GGML_OP_ARANGE,
GGML_OP_TIMESTEP_EMBEDDING,
GGML_OP_ARGSORT,
int p2,
int p3);
+ // pad the first dimension with reflection: [a, b, c, d] -> [b, a, b, c, d, c]
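+ // p0 and p1 are the left and right pad lengths, e.g. p0 = 2, p1 = 1: [a, b, c, d] -> [c, b, a, b, c, d, c]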
+ GGML_API struct ggml_tensor * ggml_pad_reflect_1d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int p0,
+ int p1);
+
// Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
// timesteps: [N,]
// return: [N, dim]
}
}
+// ggml_compute_forward_pad_reflect_1d
+
+static void ggml_compute_forward_pad_reflect_1d(
+ const struct ggml_compute_params * params,
+ struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+ const int ith = params->ith;
+ const int nth = params->nth;
+
+ const int32_t * opts = (const int32_t *) dst->op_params;
+ const int p0 = opts[0];
+ const int p1 = opts[1];
+
+ GGML_TENSOR_UNARY_OP_LOCALS
+
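+ // for each row: copy the source row into the middle of the destination row,
+ // then mirror the first p0 and last p1 interior elements outwards into the pads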
+ for (int64_t i3 = 0; i3 < ne3; i3++) {
+ for (int64_t i2 = 0; i2 < ne2; i2++) {
+ for (int64_t i1 = ith; i1 < ne1; i1 += nth) {
+ float * left = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + p0*nb0);
+ float * right = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + (ne0-p1-1)*nb0);
+
+ ggml_vec_cpy_f32(ne00, left, (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01));
+
+ for (int i0 = 1; i0 <= p0; i0++) { left[-i0] = left[i0]; }
+ for (int i0 = 1; i0 <= p1; i0++) { right[i0] = right[-i0]; }
+ }
+ }
+ }
+}
// ggml_compute_forward_arange
{
ggml_compute_forward_pad(params, tensor);
} break;
+ case GGML_OP_PAD_REFLECT_1D:
+ {
+ ggml_compute_forward_pad_reflect_1d(params, tensor);
+ } break;
case GGML_OP_ARANGE:
{
ggml_compute_forward_arange(params, tensor);
} break;
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
+ case GGML_OP_PAD_REFLECT_1D:
case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT:
GGML_METAL_KERNEL_TYPE_CONV_TRANSPOSE_1D_F16_F32,
GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
GGML_METAL_KERNEL_TYPE_PAD_F32,
+ GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32,
GGML_METAL_KERNEL_TYPE_ARANGE_F32,
GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CONV_TRANSPOSE_1D_F16_F32, conv_transpose_1d_f16_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
case GGML_OP_POOL_2D:
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
+ case GGML_OP_PAD_REFLECT_1D:
case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT:
const int nth = MIN(1024, ne0);
+ [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+ } break;
+ case GGML_OP_PAD_REFLECT_1D:
+ {
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
+
+ const int32_t p0 = ((const int32_t *)(dst->op_params))[0];
+ const int32_t p1 = ((const int32_t *)(dst->op_params))[1];
+
+ id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32].pipeline;
+
+ [encoder setComputePipelineState:pipeline];
+ [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+ [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
+ [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
+ [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
+ [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
+ [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
+ [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:6];
+ [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7];
+ [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:8];
+ [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:9];
+ [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:10];
+ [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:11];
+ [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:12];
+ [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:13];
+ [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:14];
+ [encoder setBytes:&p0 length:sizeof(p0) atIndex:15];
+ [encoder setBytes:&p1 length:sizeof(p1) atIndex:16];
+
+ const int nth = MIN(1024, ne0);
+
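+ // one threadgroup per destination row (ne1 x ne2 x ne3); nth threads stride across the row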
[encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break;
case GGML_OP_ARANGE:
}
}
+kernel void kernel_pad_reflect_1d_f32(
+ device const char * src0,
+ device char * dst,
+ constant int64_t & ne00,
+ constant int64_t & ne01,
+ constant int64_t & ne02,
+ constant int64_t & ne03,
+ constant int64_t & ne0,
+ constant uint64_t & nb00,
+ constant uint64_t & nb01,
+ constant uint64_t & nb02,
+ constant uint64_t & nb03,
+ constant uint64_t & nb0,
+ constant uint64_t & nb1,
+ constant uint64_t & nb2,
+ constant uint64_t & nb3,
+ constant int32_t & p0,
+ constant int32_t & p1,
+ uint3 tgpig[[threadgroup_position_in_grid]],
+ uint3 tgpg[[threadgroups_per_grid]],
+ uint3 tpitg[[thread_position_in_threadgroup]],
+ uint3 ntg[[threads_per_threadgroup]]) {
+
+ const int64_t i3 = tgpig.z;
+ const int64_t i2 = tgpig.y;
+ const int64_t i1 = tgpig.x;
+
+ const int64_t i03 = i3;
+ const int64_t i02 = i2;
+ const int64_t i01 = i1;
+
+ device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01);
+ device float * dst_ptr = (device float *) (dst + i3*nb3 + i2*nb2 + i1*nb1);
+
+ if (i1 < ne01 && i2 < ne02 && i3 < ne03) {
+ for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
+ if (i0 < p0) {
+ // left pad: mirror the first p0 interior elements of the row
+ dst_ptr[i0] = src0_ptr[p0 - i0];
+ } else if (i0 < ne0 - p1) {
+ // middle: copy the source row
+ dst_ptr[i0] = src0_ptr[i0 - p0];
+ } else {
+ // right pad: mirror the last p1 interior elements, i.e. source index 2*(ne00 - 1) - (i0 - p0)
+ dst_ptr[i0] = src0_ptr[(ne0 - p1 - p0) - (p1 + 1 - (ne0 - i0)) - 1];
+ }
+ }
+ }
+}
+
kernel void kernel_arange_f32(
device char * dst,
constant int64_t & ne0,
"POOL_2D_BACK",
"UPSCALE",
"PAD",
+ "PAD_REFLECT_1D",
"ARANGE",
"TIMESTEP_EMBEDDING",
"ARGSORT",
"OPT_STEP_ADAMW",
};
-static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
+static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none",
"pool_2d_back(x)",
"upscale(x)",
"pad(x)",
+ "pad_reflect_1d(x)",
"arange(start, stop, step)",
"timestep_embedding(timesteps, dim, max_period)",
"argsort(x)",
"adamw(x)",
};
-static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
+static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
return result;
}
+// ggml_pad_reflect_1d
+
+struct ggml_tensor * ggml_pad_reflect_1d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int p0,
+ int p1) {
+ GGML_ASSERT(p0 >= 0);
+ GGML_ASSERT(p1 >= 0);
+
+ GGML_ASSERT(p0 < a->ne[0]); // padding length on each side must be less than the
+ GGML_ASSERT(p1 < a->ne[0]); // existing length of the dimension being padded
+
+ GGML_ASSERT(ggml_is_contiguous(a));
+ GGML_ASSERT(a->type == GGML_TYPE_F32);
+
+ struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
+ a->ne[0] + p0 + p1,
+ a->ne[1],
+ a->ne[2],
+ a->ne[3]);
+
+ int32_t params[] = { p0, p1 };
+ ggml_set_op_params(result, params, sizeof(params));
+
+ result->op = GGML_OP_PAD_REFLECT_1D;
+ result->src[0] = a;
+
+ return result;
+}
+
// ggml_arange
struct ggml_tensor * ggml_arange(
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw")
+#
+# test-pad-reflect-1d
+
+set(TEST_TARGET test-pad-reflect-1d)
+add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp)
+target_link_libraries(${TEST_TARGET} PRIVATE ggml)
+add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
#
# test-conv-transpose
}
};
+// GGML_OP_PAD_REFLECT_1D
+struct test_pad_reflect_1d : public test_case {
+ const ggml_type type;
+ const std::array<int64_t, 4> ne_a;
+ const int pad_0;
+ const int pad_1;
+
+ std::string vars() override {
+ return VARS_TO_STR4(type, ne_a, pad_0, pad_1);
+ }
+
+ test_pad_reflect_1d(ggml_type type = GGML_TYPE_F32,
+ std::array<int64_t, 4> ne_a = {512, 34, 2, 1},
+ int pad_0 = 10, int pad_1 = 9)
+ : type(type), ne_a(ne_a), pad_0(pad_0), pad_1(pad_1) {}
+
+ ggml_tensor * build_graph(ggml_context * ctx) override {
+ ggml_tensor * a = ggml_new_tensor(ctx, type, 2, ne_a.data());
+ ggml_set_name(a, "a");
+
+ ggml_tensor * out = ggml_pad_reflect_1d(ctx, a, pad_0, pad_1);
+ ggml_set_name(out, "out");
+
+ return out;
+ }
+};
+
// GGML_OP_ARANGE
struct test_arange : public test_case {
const ggml_type type;
test_cases.emplace_back(new test_group_norm(GGML_TYPE_F32, {9, 9, 1280, 1}));
test_cases.emplace_back(new test_acc());
test_cases.emplace_back(new test_pad());
+ test_cases.emplace_back(new test_pad_reflect_1d());
test_cases.emplace_back(new test_arange());
test_cases.emplace_back(new test_timestep_embedding());
test_cases.emplace_back(new test_leaky_relu());
--- /dev/null
+#include "ggml.h"
+#include "ggml-cpu.h"
+#include "ggml-alloc.h"
+#include "ggml-backend.h"
+
+#ifdef GGML_USE_CUDA
+#include "ggml-cuda.h"
+#endif
+
+#ifdef GGML_USE_METAL
+#include "ggml-metal.h"
+#endif
+
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+
+static void ggml_log_callback_default(ggml_log_level level, const char * text, void * user_data) {
+ (void) level;
+ (void) user_data;
+ fputs(text, stderr);
+ fflush(stderr);
+}
+
+struct ggml_context* make_ctx(void) {
+ struct ggml_init_params params = {
+ .mem_size = 2 * 1024 * 1024,
+ };
+ return ggml_init(params);
+}
+
+void check_tensor(struct ggml_tensor * t, float * expected_t_d, int ne0, int ne1, int ne2) {
+ GGML_ASSERT(t->type == GGML_TYPE_F32);
+ GGML_ASSERT(t->ne[0] == ne0);
+ GGML_ASSERT(t->ne[1] == ne1);
+ GGML_ASSERT(t->ne[2] == ne2);
+
+ // read the result back through the backend so the check also works when the
+ // tensor lives in non-host-accessible memory (e.g. the CUDA backend)
+ float * actual_t_d = (float *) malloc(ggml_nbytes(t));
+ ggml_backend_tensor_get(t, actual_t_d, 0, ggml_nbytes(t));
+
+ for (int i2 = 0; i2 < ne2; ++i2) {
+ for (int i1 = 0; i1 < ne1; ++i1) {
+ for (int i0 = 0; i0 < ne0; ++i0) {
+ float expected = expected_t_d[i2 * ne1 * ne0 + i1 * ne0 + i0];
+ float actual = actual_t_d[i2 * ne1 * ne0 + i1 * ne0 + i0];
+ if (expected != actual) {
+ printf("expected %.1f, got %.1f at (%d,%d,%d)\n", expected, actual, i0, i1, i2);
+ }
+ GGML_ASSERT(expected == actual);
+ }
+ }
+ }
+
+ free(actual_t_d);
+}
+
+void test_pad_reflect_1d(bool use_gpu) {
+ ggml_backend_t backend = NULL;
+ struct ggml_init_params params;
+ ggml_backend_buffer_t buffer;
+ struct ggml_context * ctx;
+ struct ggml_tallocr tallocr;
+ ggml_gallocr_t gallocr;
+
+ // initialize the backend
+#ifdef GGML_USE_CUDA
+ if (use_gpu) {
+ fprintf(stderr, "%s: using CUDA backend\n", __func__);
+ backend = ggml_backend_cuda_init(0);
+ if (!backend) {
+ fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
+ }
+ }
+#endif
+
+#ifdef GGML_USE_METAL
+ if (use_gpu) {
+ fprintf(stderr, "%s: using Metal backend\n", __func__);
+ backend = ggml_backend_metal_init();
+ if (!backend) {
+ fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
+ }
+ }
+#endif
+
+ if (!backend) {
+ fprintf(stderr, "%s: using CPU backend\n", __func__);
+ backend = ggml_backend_cpu_init();
+ }
+
+ // Test cases for different padding configurations
+ {
+ params = (struct ggml_init_params){
+ .mem_size = 16*1024*1024,
+ .mem_buffer = NULL,
+ .no_alloc = true,
+ };
+
+ ggml_log_set(ggml_log_callback_default, nullptr);
+
+ ctx = ggml_init(params);
+ buffer = ggml_backend_alloc_buffer(backend, 16*1024*1024);
+ tallocr = ggml_tallocr_new(buffer);
+ gallocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
+
+ // Create a simple 1D input tensor [1, 2, 3, 4]
+ struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
+ float input_data[] = {1.0f, 2.0f, 3.0f, 4.0f};
+ ggml_tallocr_alloc(&tallocr, t);
+
+ // load data to buffer
+ if (ggml_backend_is_cpu(backend)) {
+ memcpy(t->data, input_data, ggml_nbytes(t));
+ } else {
+ ggml_backend_tensor_set(t, input_data, 0, ggml_nbytes(t));
+ }
+
+ // Test case 1: pad left=1, right=1
+ // Expected: [2, 1, 2, 3, 4, 3]
+ float expected_1[] = {2.0f, 1.0f, 2.0f, 3.0f, 4.0f, 3.0f};
+ struct ggml_tensor * out_1 = ggml_pad_reflect_1d(ctx, t, 1, 1);
+
+ // Test case 2: pad left=2, right=1
+ // Expected: [3, 2, 1, 2, 3, 4, 3]
+ float expected_2[] = {3.0f, 2.0f, 1.0f, 2.0f, 3.0f, 4.0f, 3.0f};
+ struct ggml_tensor * out_2 = ggml_pad_reflect_1d(ctx, t, 2, 1);
+
+ // Test case 3: pad left=1, right=2
+ // Expected: [2, 1, 2, 3, 4, 3, 2]
+ float expected_3[] = {2.0f, 1.0f, 2.0f, 3.0f, 4.0f, 3.0f, 2.0f};
+ struct ggml_tensor * out_3 = ggml_pad_reflect_1d(ctx, t, 1, 2);
+
+ struct ggml_cgraph * gf = ggml_new_graph(ctx);
+ ggml_build_forward_expand(gf, out_1);
+ ggml_build_forward_expand(gf, out_2);
+ ggml_build_forward_expand(gf, out_3);
+
+ ggml_gallocr_alloc_graph(gallocr, gf);
+
+ ggml_backend_graph_compute(backend, gf);
+
+ check_tensor(out_1, expected_1, 6, 1, 1);
+ check_tensor(out_2, expected_2, 7, 1, 1);
+ check_tensor(out_3, expected_3, 7, 1, 1);
+
+ ggml_free(ctx);
+ ggml_backend_buffer_free(buffer);
+ ggml_gallocr_free(gallocr);
+ }
+
+ {
+ params = (struct ggml_init_params){
+ .mem_size = 16*1024*1024,
+ .mem_buffer = NULL,
+ .no_alloc = true,
+ };
+
+ ggml_log_set(ggml_log_callback_default, nullptr);
+
+ ctx = ggml_init(params);
+ buffer = ggml_backend_alloc_buffer(backend, 16*1024*1024);
+ tallocr = ggml_tallocr_new(buffer);
+ gallocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
+
+ // Create a 2D input tensor (5 columns × 4 rows)
+ struct ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 5, 4);
+ float input_data[] = {
+ 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, // row 1
+ 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, // row 2
+ 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, // row 3
+ 16.0f, 17.0f, 18.0f, 19.0f, 20.0f // row 4
+ };
+ ggml_tallocr_alloc(&tallocr, t);
+
+ // load data to buffer
+ if (ggml_backend_is_cpu(backend)) {
+ memcpy(t->data, input_data, ggml_nbytes(t));
+ } else {
+ ggml_backend_tensor_set(t, input_data, 0, ggml_nbytes(t));
+ }
+
+ // Test case 4: pad left=3, right=2 on a 2D tensor
+ // Each row should be padded independently
+ float expected_4[] = {
+ 4.0f, 3.0f, 2.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 4.0f, 3.0f, // row 1
+ 9.0f, 8.0f, 7.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 9.0f, 8.0f, // row 2
+ 14.0f, 13.0f, 12.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 14.0f, 13.0f, // row 3
+ 19.0f, 18.0f, 17.0f, 16.0f, 17.0f, 18.0f, 19.0f, 20.0f, 19.0f, 18.0f // row 4
+ };
+ struct ggml_tensor * out_4 = ggml_pad_reflect_1d(ctx, t, 3, 2);
+
+ struct ggml_cgraph * gf = ggml_new_graph(ctx);
+ ggml_build_forward_expand(gf, out_4);
+
+ ggml_gallocr_alloc_graph(gallocr, gf);
+
+ ggml_backend_graph_compute(backend, gf);
+
+ check_tensor(out_4, expected_4, 10, 4, 1);
+
+ ggml_free(ctx);
+ ggml_gallocr_free(gallocr);
+ ggml_backend_buffer_free(buffer);
+ }
+
+ ggml_backend_free(backend);
+}
+
+int main(int argc, const char * argv[]) {
+ bool use_gpu = false;
+ if (argc > 1) {
+ use_gpu = strcmp(argv[1], "--gpu") == 0;
+ }
+ test_pad_reflect_1d(use_gpu);
+ return 0;
+}