ggml-cpu: arm64: Q4_K scale unroll and vectorization (llama/19108)

author Alberto Cabrera Pérez <redacted>
Wed, 28 Jan 2026 07:15:56 +0000 (07:15 +0000)
committer Georgi Gerganov <redacted>
Fri, 30 Jan 2026 13:56:40 +0000 (15:56 +0200)
diff --git a/ggml/src/ggml-cpu/arch/arm/repack.cpp b/ggml/src/ggml-cpu/arch/arm/repack.cpp

index f40226494cd3ac2794214c3444b4ab3b65716f2c..99bb70274c56c92478cfdef2d987904009008ae0 100644 (file)
--- a/ggml/src/ggml-cpu/arch/arm/repack.cpp
+++ b/ggml/src/ggml-cpu/arch/arm/repack.cpp
@@ -3148,16 +3148,17 @@ void ggml_gemm_q4_K_8x8_q8_K(int                        n,
  
                          // Scales[i] corresponds to column i
                          const int scale_offset = cp * 2;
-                        for (int blk = 0; blk < 2; blk++) {
-                            const int32x4_t block_scale = {
-                                (int32_t) q4sb_scales[blk][scale_offset],
-                                (int32_t) q4sb_scales[blk][scale_offset],
-                                (int32_t) q4sb_scales[blk][scale_offset + 1],
-                                (int32_t) q4sb_scales[blk][scale_offset + 1],
-                            };
-                            acc[cp]     = vmlaq_s32(acc[cp], sb_acc[blk], block_scale);
-                            acc[cp + 4] = vmlaq_s32(acc[cp + 4], sb_acc[blk + 2], block_scale);
-                        }
+                        const int32_t scale_00 = q4sb_scales[0][scale_offset];
+                        const int32_t scale_01 = q4sb_scales[0][scale_offset + 1];
+                        const int32_t scale_10 = q4sb_scales[1][scale_offset];
+                        const int32_t scale_11 = q4sb_scales[1][scale_offset + 1];
+                        const int32x4_t block_scale_0 = vcombine_s32(vdup_n_s32(scale_00), vdup_n_s32(scale_01));
+                        const int32x4_t block_scale_1 = vcombine_s32(vdup_n_s32(scale_10), vdup_n_s32(scale_11));
+
+                        acc[cp]     = vmlaq_s32(acc[cp], sb_acc[0], block_scale_0);
+                        acc[cp + 4] = vmlaq_s32(acc[cp + 4], sb_acc[2], block_scale_0);
+                        acc[cp]     = vmlaq_s32(acc[cp], sb_acc[1], block_scale_1);
+                        acc[cp + 4] = vmlaq_s32(acc[cp + 4], sb_acc[3], block_scale_1);
                      }
  
                      // Multiply Acc bsum + mins
author	Alberto Cabrera Pérez <redacted>
	Wed, 28 Jan 2026 07:15:56 +0000 (07:15 +0000)
committer	Georgi Gerganov <redacted>
	Fri, 30 Jan 2026 13:56:40 +0000 (15:56 +0200)