whisper : remove ggml_mul_mat padding (#3436)

author Georgi Gerganov <redacted>

Mon, 29 Sep 2025 13:42:08 +0000 (16:42 +0300)

committer GitHub <redacted>

Mon, 29 Sep 2025 13:42:08 +0000 (16:42 +0300)
author Georgi Gerganov <redacted>
Mon, 29 Sep 2025 13:42:08 +0000 (16:42 +0300)
committer GitHub <redacted>
Mon, 29 Sep 2025 13:42:08 +0000 (16:42 +0300)
diff --git a/src/whisper.cpp b/src/whisper.cpp

index 52de68c2b128336b0e303c45caab4685c84d3aec..efc3192b47c3b913883bf572ab3f2825bef40c26 100644 (file)
--- a/src/whisper.cpp
+++ b/src/whisper.cpp
@@ -252,45 +252,6 @@ static void whisper_set_i32_nd(struct ggml_tensor * t, int64_t i0, int64_t i1, i
      *(int32_t *) data = v;
  }
  
-// faster matrix multiplications for tensors that do not have dimension 0 divisible by "pad"
-// the idea is to represent the original matrix multiplication:
-//
-//   Z = X @ Y
-//
-// with the sum of two matrix multiplications:
-//
-//   Z = (X_0 @ Y_0) + (X_1 @ Y_1)
-//
-// here X_0 and Y_0 are views of X and Y that have dimension 0 divisible by "pad"
-// and X_1 and Y_1 are the remaining views. X_1 and Y_1 end up being small matrices that can be processed with more
-// general-purpose kernels
-//
-static struct ggml_tensor * ggml_mul_mat_pad(struct ggml_context * ctx, struct ggml_tensor * x, struct ggml_tensor * y, int pad = 32) {
-    // use padding only if dimension 0 is at least 8 times larger than the padding
-    // else we won't get much benefit from the optimization
-    const int n_pad_req = 8;
-
-    if (x->ne[0] % pad == 0 || x->ne[0] / pad < n_pad_req) {
-        return ggml_mul_mat(ctx, x, y);
-    }
-
-    struct ggml_tensor * x_0 = ggml_view_3d(ctx, x, (x->ne[0]/pad)*pad, x->ne[1], x->ne[2], x->nb[1], x->nb[2], 0);
-    struct ggml_tensor * x_1 = ggml_view_3d(ctx, x,  x->ne[0]%pad,      x->ne[1], x->ne[2], x->nb[1], x->nb[2], x_0->ne[0]*x_0->nb[0]);
-
-    struct ggml_tensor * y_0 = ggml_view_3d(ctx, y, (y->ne[0]/pad)*pad, y->ne[1], y->ne[2], y->nb[1], y->nb[2], 0);
-    struct ggml_tensor * y_1 = ggml_view_3d(ctx, y,  y->ne[0]%pad,      y->ne[1], y->ne[2], y->nb[1], y->nb[2], y_0->ne[0]*y_0->nb[0]);
-
-    return ggml_add(ctx,
-            ggml_mul_mat(ctx, x_0, y_0),
-            ggml_mul_mat(ctx, x_1, y_1));
-}
-
-// TODO: check if other platforms can benefit from this optimization
-// TODO: CUDA is currently broken - seems ggml_mul_mat does not handle views correctly
-#if defined(GGML_USE_METAL)
-#define ggml_mul_mat ggml_mul_mat_pad
-#endif
-
  // available whisper models
  enum e_model {
      MODEL_UNKNOWN,
author	Georgi Gerganov <redacted>
	Mon, 29 Sep 2025 13:42:08 +0000 (16:42 +0300)
committer	GitHub <redacted>
	Mon, 29 Sep 2025 13:42:08 +0000 (16:42 +0300)