ggml: Correct SVE implementation in ggml_vec_dot_f16_unroll (#16518)

author sirus20x6 <redacted>

Sun, 12 Oct 2025 05:15:00 +0000 (00:15 -0500)

committer GitHub <redacted>

Sun, 12 Oct 2025 05:15:00 +0000 (08:15 +0300)
author sirus20x6 <redacted>
Sun, 12 Oct 2025 05:15:00 +0000 (00:15 -0500)
committer GitHub <redacted>
Sun, 12 Oct 2025 05:15:00 +0000 (08:15 +0300)
diff --git a/ggml/src/ggml-cpu/vec.h b/ggml/src/ggml-cpu/vec.h
index 2751359ce49f456082239dfeed8cfe1c8f6e922b..d3834182a603c7e0c6aff2c3ab30f44eb5781a5e 100644
--- a/ggml/src/ggml-cpu/vec.h
+++ b/ggml/src/ggml-cpu/vec.h
@@ -144,14 +144,14 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
          for (int i = 0; i < np; i += ggml_f16_step) {
              ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0); // 8 elements
  
-            ax1 = GGML_F16x_VEC_LOAD(x[0] + i + 0*ggml_f16_epr, 0); // 8 elemnst
+            ax1 = GGML_F16x_VEC_LOAD(x[0] + i + 0*ggml_f16_epr, 0); // 8 elements
              sum_00 = GGML_F16x_VEC_FMA(sum_00, ax1, ay1);     // sum_00 = sum_00+ax1*ay1
              ax1 = GGML_F16x_VEC_LOAD(x[1] + i + 0*ggml_f16_epr, 0); // 8 elements
              sum_10 = GGML_F16x_VEC_FMA(sum_10, ax1, ay1);
  
              ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1); // next 8 elements
  
-            ax2 = GGML_F16x_VEC_LOAD(x[0] + i + 1*ggml_f16_epr, 1); // next 8 ekements
+            ax2 = GGML_F16x_VEC_LOAD(x[0] + i + 1*ggml_f16_epr, 1); // next 8 elements
              sum_01 = GGML_F16x_VEC_FMA(sum_01, ax2, ay2);
              ax2 = GGML_F16x_VEC_LOAD(x[1] + i + 1*ggml_f16_epr, 1);
              sum_11 = GGML_F16x_VEC_FMA(sum_11, ax2, ay2);
@@ -160,7 +160,7 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
  
              ax3 = GGML_F16x_VEC_LOAD(x[0] + i + 2*ggml_f16_epr, 2);
              sum_02 = GGML_F16x_VEC_FMA(sum_02, ax3, ay3);
-            ax1 = GGML_F16x_VEC_LOAD(x[1] + i + 2*ggml_f16_epr, 2);
+            ax3 = GGML_F16x_VEC_LOAD(x[1] + i + 2*ggml_f16_epr, 2);
              sum_12 = GGML_F16x_VEC_FMA(sum_12, ax3, ay3);
  
              ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
author	sirus20x6 <redacted>
	Sun, 12 Oct 2025 05:15:00 +0000 (00:15 -0500)
committer	GitHub <redacted>
	Sun, 12 Oct 2025 05:15:00 +0000 (08:15 +0300)