Add `ggml_roll` (#1274)

author Acly <redacted>

Wed, 18 Jun 2025 11:34:50 +0000 (13:34 +0200)

committer GitHub <redacted>

Wed, 18 Jun 2025 11:34:50 +0000 (13:34 +0200)
author Acly <redacted>
Wed, 18 Jun 2025 11:34:50 +0000 (13:34 +0200)
committer GitHub <redacted>
Wed, 18 Jun 2025 11:34:50 +0000 (13:34 +0200)
diff --git a/include/ggml.h b/include/ggml.h

index 1a57f1cd75a31eedb2550c87bedca0d4c175e9dc..9c4e24023b5ad20c1d59d8686a7e03763bc3987c 100644 (file)
--- a/include/ggml.h
+++ b/include/ggml.h
@@ -489,6 +489,7 @@ extern "C" {
          GGML_OP_UPSCALE, // nearest interpolate
          GGML_OP_PAD,
          GGML_OP_PAD_REFLECT_1D,
+        GGML_OP_ROLL,
          GGML_OP_ARANGE,
          GGML_OP_TIMESTEP_EMBEDDING,
          GGML_OP_ARGSORT,
@@ -1801,6 +1802,17 @@ extern "C" {
              int                   p0,
              int                   p1);
  
+    // Move tensor elements by an offset given for each dimension. Elements that
+    // are shifted beyond the last position are wrapped around to the beginning;
+    // with a negative shift, elements moved before the first position wrap
+    // around to the end.
+    //
+    // shift0..shift3 apply to dimensions 0..3 of a. Each one must satisfy
+    // abs(shift) < size of its dimension, otherwise ggml_roll asserts.
+    // The CPU implementation only handles GGML_TYPE_F32 input.
+    GGML_API struct ggml_tensor * ggml_roll(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   shift0,
+            int                   shift1,
+            int                   shift2,
+            int                   shift3);
+
+
      // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
      // timesteps: [N,]
      // return: [N, dim]
diff --git a/src/ggml-cpu/ggml-cpu.c b/src/ggml-cpu/ggml-cpu.c

index 2c12e493bc9b01aa103d6b5c179298101a46d1a8..3e494bb8cf076235837f69fb42cbd4040494ff99 100644 (file)
--- a/src/ggml-cpu/ggml-cpu.c
+++ b/src/ggml-cpu/ggml-cpu.c
@@ -1967,6 +1967,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
              {
                  ggml_compute_forward_pad_reflect_1d(params, tensor);
              } break;
+        case GGML_OP_ROLL:
+            {
+                ggml_compute_forward_roll(params, tensor);
+            } break;
          case GGML_OP_ARANGE:
              {
                  ggml_compute_forward_arange(params, tensor);
@@ -2291,6 +2295,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
          case GGML_OP_UPSCALE:
          case GGML_OP_PAD:
          case GGML_OP_PAD_REFLECT_1D:
+        case GGML_OP_ROLL:
          case GGML_OP_ARANGE:
          case GGML_OP_TIMESTEP_EMBEDDING:
          case GGML_OP_ARGSORT:
diff --git a/src/ggml-cpu/ops.cpp b/src/ggml-cpu/ops.cpp

index 08facb6d03d5e830fcd36f75c145195bb66d8363..eff4a53e3442b4ffa0ec73f158f4f62091a6b6ca 100644 (file)
--- a/src/ggml-cpu/ops.cpp
+++ b/src/ggml-cpu/ops.cpp
@@ -6793,6 +6793,73 @@ void ggml_compute_forward_pad_reflect_1d(
      }
  }
  
+// ggml_compute_forward_roll
+
+// Brings an index that is at most one period outside [0, ne) back into range:
+// negative values are raised by ne, values >= ne are lowered by ne, and in-range
+// values are returned unchanged. Only a single correction is applied.
+static int64_t ggml_wrap_index(int64_t i, int64_t ne) {
+    return i < 0 ? i + ne
+                 : (i >= ne ? i - ne : i);
+}
+
+// Rolls an F32 tensor on the CPU. Work is divided by rows: every combination of
+// (i1, i2, i3) identifies one row of ne00 floats, and each thread processes a
+// contiguous range of rows.
+//
+// params: provides the calling thread's index (ith) and the thread count (nth)
+// dst:    output tensor; dst->src[0] is the input and op params 0..3 hold the
+//         shift for dims 0..3
+static void ggml_compute_forward_roll_f32(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src_data = (const float *) src0->data;
+    float * dst_data = (float *) dst->data;
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    const int s0 = ggml_get_op_params_i32(dst, 0);
+    const int s1 = ggml_get_op_params_i32(dst, 1);
+    const int s2 = ggml_get_op_params_i32(dst, 2);
+    const int s3 = ggml_get_op_params_i32(dst, 3);
+
+    // Rows per thread = ceil(total / nth). Adding nth instead of nth - 1 would hand
+    // out one row too many whenever nth divides total (2 rows on 2 threads would
+    // give thread 0 both rows) and leave the trailing threads without work.
+    const int64_t total = ne1 * ne2 * ne3;
+    const int64_t per_thread = (total + params->nth - 1) / params->nth;
+    const int64_t start = params->ith * per_thread;
+    // start can lie past total for trailing threads; the loop then runs zero times
+    const int64_t end   = std::min(start + per_thread, total);
+
+    for (int64_t i = start; i < end; ++i) {
+        // unflatten the row number into destination indices for dims 1..3
+        const int64_t i1 = i % ne1;
+        const int64_t i2 = (i / ne1) % ne2;
+        const int64_t i3 = i / (ne2 * ne1);
+        // offsets built from nb* are divided by sizeof(float) to index a float pointer
+        float * dst_row = dst_data + (i3*nb3 + i2*nb2 + i1*nb1) / sizeof(float);
+
+        // the source row is the destination row shifted back by s1..s3;
+        // ggml_wrap_index corrects by at most one period, which relies on
+        // abs(shift) < ne as asserted in ggml_roll
+        const int64_t i01 = ggml_wrap_index(i1 - s1, ne01);
+        const int64_t i02 = ggml_wrap_index(i2 - s2, ne02);
+        const int64_t i03 = ggml_wrap_index(i3 - s3, ne03);
+        const float * src_row = src_data + (i03*nb03 + i02*nb02 + i01*nb01) / sizeof(float);
+
+        // rotate within the row using two copies: dst[0, n) <- src[s, ne00) and
+        // dst[n, ne00) <- src[0, s), where s is the source index that lands on dst[0]
+        const int64_t s = ggml_wrap_index(-s0, ne00);
+        const int64_t n = ne00 - s;
+        ggml_vec_cpy_f32(n, dst_row,     src_row + s);
+        ggml_vec_cpy_f32(s, dst_row + n, src_row);
+    }
+}
+
+// CPU entry point for GGML_OP_ROLL: dispatches on the type of dst->src[0].
+// Only GGML_TYPE_F32 has a kernel; any other type aborts.
+void ggml_compute_forward_roll(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    if (dst->src[0]->type != GGML_TYPE_F32) {
+        GGML_ABORT("fatal error");
+    }
+
+    ggml_compute_forward_roll_f32(params, dst);
+}
+
  // ggml_compute_forward_arange
  
  static void ggml_compute_forward_arange_f32(
diff --git a/src/ggml-cpu/ops.h b/src/ggml-cpu/ops.h

index dc081b9e663970458606651a43bd640a187641c2..2d8544d7d3d436afd00eae1a149ad308c1d58690 100644 (file)
--- a/src/ggml-cpu/ops.h
+++ b/src/ggml-cpu/ops.h
@@ -72,6 +72,7 @@ void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params
  void ggml_compute_forward_upscale(const struct ggml_compute_params * params, struct ggml_tensor * dst);
  void ggml_compute_forward_pad(const struct ggml_compute_params * params, struct ggml_tensor * dst);
  void ggml_compute_forward_pad_reflect_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_roll(const struct ggml_compute_params * params, struct ggml_tensor * dst);
  void ggml_compute_forward_arange(const struct ggml_compute_params * params, struct ggml_tensor * dst);
  void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst);
  void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst);
diff --git a/src/ggml.c b/src/ggml.c

index a8edad3778aa9dab82903d913109ebeb51fc992d..f8e7c595bce15dd214c8ef366bdd56087e3eff78 100644 (file)
--- a/src/ggml.c
+++ b/src/ggml.c
@@ -955,6 +955,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
      "UPSCALE",
      "PAD",
      "PAD_REFLECT_1D",
+    "ROLL",
      "ARANGE",
      "TIMESTEP_EMBEDDING",
      "ARGSORT",
@@ -985,7 +986,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
      "OPT_STEP_ADAMW",
  };
  
-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
  
  static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
      "none",
@@ -1050,6 +1051,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
      "upscale(x)",
      "pad(x)",
      "pad_reflect_1d(x)",
+    "roll(x)",
      "arange(start, stop, step)",
      "timestep_embedding(timesteps, dim, max_period)",
      "argsort(x)",
@@ -1080,7 +1082,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
      "adamw(x)",
  };
  
-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
  
  static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
  
@@ -4341,6 +4343,34 @@ struct ggml_tensor * ggml_pad_reflect_1d(
      return result;
  }
  
+// ggml_roll
+
+// Creates a GGML_OP_ROLL node with a as its only source and shift0..shift3
+// stored in op params 0..3. The result tensor comes from ggml_dup_tensor(ctx, a).
+struct ggml_tensor * ggml_roll(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   shift0,
+        int                   shift1,
+        int                   shift2,
+        int                   shift3) {
+    // the dim-0 stride must equal the value reported by ggml_type_size for a's type
+    GGML_ASSERT(a->nb[0] == ggml_type_size(a->type));
+    // each shift must be strictly smaller in magnitude than the size of its dimension
+    GGML_ASSERT(abs(shift0) < a->ne[0]);
+    GGML_ASSERT(abs(shift1) < a->ne[1]);
+    GGML_ASSERT(abs(shift2) < a->ne[2]);
+    GGML_ASSERT(abs(shift3) < a->ne[3]);
+
+    const int shifts[4] = { shift0, shift1, shift2, shift3 };
+
+    struct ggml_tensor * rolled = ggml_dup_tensor(ctx, a);
+
+    rolled->op     = GGML_OP_ROLL;
+    rolled->src[0] = a;
+
+    for (int dim = 0; dim < 4; ++dim) {
+        ggml_set_op_params_i32(rolled, dim, shifts[dim]);
+    }
+
+    return rolled;
+}
+
  // ggml_arange
  
  struct ggml_tensor * ggml_arange(
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt

index 6398bda6a40ed97bcb0cff588ce0a4029bb42b4b..5263cfd583c0be192621672a647bd1709cf0dfe7 100644 (file)
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -331,6 +331,14 @@ if (NOT GGML_BACKEND_DL)
      target_link_libraries(${TEST_TARGET} PRIVATE ggml)
      add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
  
+    #
+    # test-roll
+
+    set(TEST_TARGET test-roll)
+    add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp)
+    target_link_libraries(${TEST_TARGET} PRIVATE ggml)
+    add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
+
      #
      # test-conv-transpose
  
diff --git a/tests/test-roll.cpp b/tests/test-roll.cpp

new file mode 100644 (file)

index 0000000..a798387
--- /dev/null
+++ b/tests/test-roll.cpp
@@ -0,0 +1,128 @@
+#include <ggml.h>
+#include <ggml-cpu.h>
+#include <ggml-alloc.h>
+#include <ggml-backend.h>
+#include <ggml-cpp.h>
+
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <array>
+#include <numeric>
+#include <vector>
+
+// Test-side index wrap: adds ne to a negative index, subtracts ne from an index
+// >= ne, and leaves in-range indices alone. Applies at most one correction.
+int64_t wrap(int64_t i, int64_t ne) {
+    return i < 0 ? i + ne
+                 : (i >= ne ? i - ne : i);
+}
+
+// Straightforward reference for the roll: treats src as a densely packed 4-D
+// array of shape ne (dim 0 fastest) and returns a new array in which every
+// element at index (i0, i1, i2, i3) is read from the source at the index
+// shifted back by shift[] and wrapped in each dimension.
+std::vector<float> roll_reference(
+    const float * src, std::array<int64_t, 4> ne, std::array<int64_t, 4> shift) {
+
+    // element strides of the packed layout
+    const int64_t stride1 = ne[0];
+    const int64_t stride2 = stride1 * ne[1];
+    const int64_t stride3 = stride2 * ne[2];
+
+    std::vector<float> rolled(stride3 * ne[3]);
+
+    for (int64_t d3 = 0; d3 < ne[3]; ++d3) {
+        const int64_t from3 = wrap(d3 - shift[3], ne[3]);
+        for (int64_t d2 = 0; d2 < ne[2]; ++d2) {
+            const int64_t from2 = wrap(d2 - shift[2], ne[2]);
+            for (int64_t d1 = 0; d1 < ne[1]; ++d1) {
+                const int64_t from1 = wrap(d1 - shift[1], ne[1]);
+
+                const int64_t dst_base = d3    * stride3 + d2    * stride2 + d1    * stride1;
+                const int64_t src_base = from3 * stride3 + from2 * stride2 + from1 * stride1;
+
+                for (int64_t d0 = 0; d0 < ne[0]; ++d0) {
+                    rolled[dst_base + d0] = src[src_base + wrap(d0 - shift[0], ne[0])];
+                }
+            }
+        }
+    }
+    return rolled;
+}
+
+// Returns n floats counting up from 0 in steps of 1: 0, 1, 2, ...
+std::vector<float> f32_range(int n) {
+    std::vector<float> ramp;
+    ramp.reserve(n);
+    float next = 0.f;
+    for (int k = 0; k < n; ++k) {
+        ramp.push_back(next);
+        ++next;
+    }
+    return ramp;
+}
+
+// Compares two float sequences element by element with an absolute tolerance of
+// 1e-5. Prints the first difference (or the size mismatch) and returns false;
+// returns true when both sequences have the same length and all elements match.
+// A NaN on either side counts as a mismatch unless the check is bypassed by exact
+// equality (which NaN never satisfies).
+bool check_equal(const std::vector<float> & result, const std::vector<float> & expected) {
+    if (result.size() != expected.size()) {
+        printf("result.size() = %zu, expected.size() = %zu\n", result.size(), expected.size());
+        return false;
+    }
+    for (size_t i = 0; i < result.size(); i++) {
+        // exact matches pass directly; this also covers equal infinities, whose
+        // difference would be NaN
+        if (result[i] == expected[i]) {
+            continue;
+        }
+        // negated <= instead of > so that a NaN difference is reported as a failure
+        if (!(std::abs(result[i] - expected[i]) <= 1e-5)) {
+            printf("result[%zu] %f != %f expected[%zu]\n", i, result[i], expected[i], i);
+            return false;
+        }
+    }
+    return true;
+}
+
+// Runs ggml_roll on an F32 tensor of shape ne with the given per-dimension shift
+// on the CPU backend (2 threads) and compares the output with roll_reference.
+// Prints one PASSED/FAILED line describing the case and returns whether it passed.
+//
+// When permute is true the roll is applied to a permuted view of the source and
+// the result is permuted back and made contiguous before comparison — presumably
+// to exercise the kernel on a non-contiguous layout (TODO confirm).
+bool test_roll(std::array<int64_t, 4> ne, std::array<int64_t, 4> shift, bool permute) {
+    ggml_time_init();
+
+    // no_alloc is set: tensor data is allocated further down via
+    // ggml_backend_alloc_ctx_tensors, so the context is sized for metadata only
+    // (room for 64 tensor headers plus one graph)
+    ggml_init_params params {
+        /*.mem_size   =*/ 64 * ggml_tensor_overhead() + ggml_graph_overhead(),
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ true
+    };
+
+    ggml_context_ptr ctx_ptr{ggml_init(params)};
+    ggml_context * ctx = ctx_ptr.get();
+    ggml_cgraph * gf = ggml_new_graph(ctx);
+
+    // Build graph
+    ggml_tensor * src = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne.data());
+    ggml_tensor * res;
+    if (!permute) {
+        res = ggml_roll(ctx, src, shift[0], shift[1], shift[2], shift[3]);
+    } else {
+        // the shift components are passed in the order (0, 2, 3, 1) to follow the
+        // axes of the permuted view; assumes p has dims (ne0, ne2, ne3, ne1) —
+        // verify against ggml_permute. The second permute undoes the first.
+        ggml_tensor * p = ggml_permute(ctx, src, 0, 3, 1, 2);
+        res = ggml_roll(ctx, p, shift[0], shift[2], shift[3], shift[1]);
+        res = ggml_cont(ctx, ggml_permute(ctx, res, 0, 2, 3, 1));
+    }
+    ggml_build_forward_expand(gf, res);
+
+    // Create backend & allocate buffers
+    ggml_backend_ptr backend_ptr{ggml_backend_cpu_init()};
+    ggml_backend_t backend = backend_ptr.get();
+    ggml_backend_cpu_set_n_threads(backend, 2);
+    ggml_backend_buffer_ptr buffer{ggml_backend_alloc_ctx_tensors(ctx, backend)};
+
+    // fill the source with 0, 1, 2, ... so every element value is unique
+    std::vector<float> src_values = f32_range(ggml_nelements(src));
+    ggml_backend_tensor_set(src, src_values.data(), 0, ggml_nbytes(src));
+
+    // Execute and compare results
+    // NOTE(review): the return value of ggml_backend_graph_compute is not checked
+    ggml_backend_graph_compute(backend, gf);
+
+    std::vector<float> res_values(ggml_nelements(res));
+    ggml_backend_tensor_get(res, res_values.data(), 0, ggml_nbytes(res));
+
+    // the reference always works on the original (unpermuted) shape and shift
+    std::vector<float> expected = roll_reference(src_values.data(), ne, shift);
+
+    bool passed = check_equal(res_values, expected);
+
+    // each dimension is printed as size(shift); the result is colored via ANSI escapes
+    printf("ggml_roll(%d(%d), %d(%d), %d(%d), %d(%d), %s): %s\n",
+        int(ne[0]), int(shift[0]),
+        int(ne[1]), int(shift[1]),
+        int(ne[2]), int(shift[2]),
+        int(ne[3]), int(shift[3]),
+        permute ? "permuted" : "contiguous",
+        passed ? "\033[32mPASSED\033[0m" : "\033[31mFAILED\033[0m");
+    return passed;
+}
+
+// Runs every roll test case (later cases still run after a failure) and returns
+// 0 when all of them pass, 1 otherwise.
+int main() {
+    struct roll_case {
+        std::array<int64_t, 4> ne;
+        std::array<int64_t, 4> shift;
+        bool                   permute;
+    };
+
+    const roll_case cases[] = {
+        { {3, 7, 4, 2},    {1, 0, -1, 0},  false },
+        { {37, 42, 59, 2}, {-4, 3, -7, 1}, false },
+        { {37, 42, 59, 2}, {-4, 3, -7, 1}, true  },
+    };
+
+    bool all_ok = true;
+    for (const roll_case & c : cases) {
+        if (!test_roll(c.ne, c.shift, c.permute)) {
+            all_ok = false;
+        }
+    }
+    return all_ok ? 0 : 1;
+}
+\ No newline at end of file
author	Acly <redacted>
	Wed, 18 Jun 2025 11:34:50 +0000 (13:34 +0200)
committer	GitHub <redacted>
	Wed, 18 Jun 2025 11:34:50 +0000 (13:34 +0200)
include/ggml.h		patch \| blob \| history
src/ggml-cpu/ggml-cpu.c		patch \| blob \| history
src/ggml-cpu/ops.cpp		patch \| blob \| history
src/ggml-cpu/ops.h		patch \| blob \| history
src/ggml.c		patch \| blob \| history
tests/CMakeLists.txt		patch \| blob \| history
tests/test-roll.cpp	[new file with mode: 0644]	patch \| blob