}
inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float v) {
-#if defined(GGML_SIMD)
- #if defined(__ARM_FEATURE_SVE)
- const int sve_register_length = svcntb() * 8;
- const int ggml_f16_epr = sve_register_length / 16;
- const int ggml_f16_step = 2 * ggml_f16_epr;
-
- GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v);
- const int np = (n & ~(ggml_f16_step - 1));
- svfloat16_t ay1, ay2;
-
- for (int i = 0; i < np; i += ggml_f16_step) {
- ay1 = GGML_F16x_VEC_LOAD(y + i + 0*ggml_f16_epr, 0);
- ay1 = GGML_F16x_VEC_MUL(ay1, vx);
- GGML_F16x_VEC_STORE(y + i + 0*ggml_f16_epr, ay1, 0);
-
- ay2 = GGML_F16x_VEC_LOAD(y + i + 1*ggml_f16_epr, 1);
- ay2 = GGML_F16x_VEC_MUL(ay2, vx);
- GGML_F16x_VEC_STORE(y + i + 1*ggml_f16_epr, ay2, 1);
- }
- // leftovers
- // maximum number of leftover elements will be less that ggmlF_16x_epr. Apply predicated svmad on available elements only
- if (np < n) {
- svbool_t pg = svwhilelt_b16(np, n);
- svfloat16_t hy = svld1_f16(pg, (__fp16 *)(y + np));
- svfloat16_t out = svmul_f16_m(pg, hy, vx);
- svst1_f16(pg, (__fp16 *)(y + np), out);
- }
- #elif defined(__riscv_v_intrinsic)
- // todo: RVV impl
- // scalar
- for (int i = 0; i < n; ++i) {
- y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
- }
- #else
- const int np = (n & ~(GGML_F16_STEP - 1));
+#if defined(GGML_SIMD) && defined(__ARM_FEATURE_SVE)
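+    // svcntb() returns the SVE vector length in bytes; x8 gives bits, and each
+    // fp16 lane is 16 bits, so this yields the number of halves per register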
+ const int sve_register_length = svcntb() * 8;
+ const int ggml_f16_epr = sve_register_length / 16;
+ const int ggml_f16_step = 2 * ggml_f16_epr;
+
+ GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v);
+ const int np = (n & ~(ggml_f16_step - 1));
+ svfloat16_t ay1, ay2;
+
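+    // main loop, unrolled by two: each iteration scales 2*ggml_f16_epr halves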
+ for (int i = 0; i < np; i += ggml_f16_step) {
+ ay1 = GGML_F16x_VEC_LOAD(y + i + 0*ggml_f16_epr, 0);
+ ay1 = GGML_F16x_VEC_MUL(ay1, vx);
+ GGML_F16x_VEC_STORE(y + i + 0*ggml_f16_epr, ay1, 0);
+
+ ay2 = GGML_F16x_VEC_LOAD(y + i + 1*ggml_f16_epr, 1);
+ ay2 = GGML_F16x_VEC_MUL(ay2, vx);
+ GGML_F16x_VEC_STORE(y + i + 1*ggml_f16_epr, ay2, 1);
+ }
+    // leftovers
+    // the main loop advances by 2*ggml_f16_epr, so up to 2*ggml_f16_epr - 1
+    // elements can remain; process them one predicated vector at a time,
+    // applying svmul to the active lanes only
+    for (int i = np; i < n; i += ggml_f16_epr) {
+        svbool_t pg = svwhilelt_b16(i, n);
+        svfloat16_t hy = svld1_f16(pg, (__fp16 *)(y + i));
+        svfloat16_t out = svmul_f16_m(pg, hy, vx);
+        svst1_f16(pg, (__fp16 *)(y + i), out);
+    }
+#elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfh)
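+    // strip-mine with vsetvl so the loop adapts to any VLEN: widen fp16 to fp32,
+    // scale in fp32 by v, then narrow the result back to fp16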
+ for (int i = 0, vl; i < n; i += vl) {
+ vl = __riscv_vsetvl_e16m2(n - i);
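+        // widening f16 at LMUL=2 to f32 at LMUL=4 keeps the lane count intact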
+ vfloat16m2_t vy = __riscv_vle16_v_f16m2((_Float16 *)&y[i], vl);
+ vfloat32m4_t vy32 = __riscv_vfwcvt_f_f_v_f32m4(vy, vl);
+ vy32 = __riscv_vfmul_vf_f32m4(vy32, v, vl);
+ vy = __riscv_vfncvt_f_f_w_f16m2(vy32, vl);
+ __riscv_vse16_v_f16m2((_Float16 *)&y[i], vy, vl);
+ }
+#elif defined(GGML_SIMD)
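+    // generic SIMD path via the GGML_F16_* macro abstraction (e.g. NEON, AVX)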
+ const int np = (n & ~(GGML_F16_STEP - 1));
-        GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
-        GGML_F16_VEC ay[GGML_F16_ARR];
-        for (int i = 0; i < np; i += GGML_F16_STEP) {
-            for (int j = 0; j < GGML_F16_ARR; j++) {
-                ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
-                ay[j] = GGML_F16_VEC_MUL(ay[j], vx);
-                GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
-            }
-        }
+    GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
+    GGML_F16_VEC ay[GGML_F16_ARR];
+    for (int i = 0; i < np; i += GGML_F16_STEP) {
+        for (int j = 0; j < GGML_F16_ARR; j++) {
+            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
+            ay[j] = GGML_F16_VEC_MUL(ay[j], vx);
+            GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
+        }
+    }
- // leftovers
- for (int i = np; i < n; ++i) {
- y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
- }
- #endif
+ // leftovers
+ for (int i = np; i < n; ++i) {
+ y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
+ }
#else
// scalar
for (int i = 0; i < n; ++i) {