option(GGML_RV_ZICBOP "ggml: enable riscv zicbop" ON)
option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF)
option(GGML_VXE "ggml: enable vxe" ON)
-option(GGML_NNPA "ggml: enable nnpa" OFF) # temp disabled by default, see: https://github.com/ggml-org/llama.cpp/issues/14877
option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
GGML_BACKEND_API int ggml_cpu_has_riscv_v (void);
GGML_BACKEND_API int ggml_cpu_has_vsx (void);
GGML_BACKEND_API int ggml_cpu_has_vxe (void);
- GGML_BACKEND_API int ggml_cpu_has_nnpa (void);
GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
GGML_BACKEND_API int ggml_cpu_has_llamafile (void);
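Each of these predicates returns 0 or 1 at runtime; as the deleted ggml_cpu_has_nnpa body further down shows, most are compile-time gates baked into the backend. A minimal caller sketch, assuming the public ggml-cpu.h header where these are declared:

    #include <stdio.h>
    #include "ggml-cpu.h"

    int main(void) {
        // Each predicate reports whether the backend was built with
        // (and, where applicable, can use) the corresponding feature.
        printf("RISCV_V:   %d\n", ggml_cpu_has_riscv_v());
        printf("VSX:       %d\n", ggml_cpu_has_vsx());
        printf("VXE:       %d\n", ggml_cpu_has_vxe());
        printf("WASM_SIMD: %d\n", ggml_cpu_has_wasm_simd());
        return 0;
    }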
# TODO: Separate the logic that determines VX/VXE/VXE2 activation
if (${S390X_M} MATCHES "8561|8562")
- set(GGML_NNPA OFF)
message(STATUS "z15 target")
list(APPEND ARCH_FLAGS -march=z15)
elseif (${S390X_M} MATCHES "3931")
list(APPEND ARCH_FLAGS -mvx -mzvector)
list(APPEND ARCH_DEFINITIONS GGML_VXE)
endif()
-
- if (GGML_NNPA)
- message(STATUS "NNPA enabled")
- list(APPEND ARCH_DEFINITIONS GGML_NNPA)
- endif()
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "wasm")
message(STATUS "Wasm detected")
list (APPEND GGML_CPU_SOURCES ggml-cpu/arch/wasm/quants.c)
#endif // __VXE2__
#endif // __s390x__ && __VEC__
-#if defined(__s390x__) && defined(GGML_NNPA)
-#ifndef __NNPA__
-#define __NNPA__
-#endif // __NNPA__
-#endif // __s390x__ && GGML_NNPA
-
#if defined(__ARM_FEATURE_SVE)
#include <sys/prctl.h>
#endif
__m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
_mm_storel_epi64((__m128i *)(y + i), y_vec);
}
-#elif defined(__NNPA__)
- for (; i + 7 < n; i += 8) {
- float32x4_t v_xh = vec_xl(0, (const float *)(x + i + 0));
- float32x4_t v_xl = vec_xl(0, (const float *)(x + i + 4));
- uint16x8_t v_yd = vec_round_from_fp32(v_xh, v_xl, 0);
- uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0);
- vec_xst(v_y, 0, (ggml_fp16_t *)(y + i));
- }
- for (; i + 3 < n; i += 4) {
- float32x4_t v_x = vec_xl(0, (const float *)(x + i));
- float32x4_t v_zero = vec_splats(0.0f);
- uint16x8_t v_yd = vec_round_from_fp32(v_x, v_zero, 0);
- uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0);
- vec_xst(v_y, 0, (ggml_fp16_t *)(y + i));
- }
#elif defined(__riscv_zvfh)
for (int vl; i < n; i += vl) {
vl = __riscv_vsetvl_e32m2(n - i);
__m128 y_vec = _mm_cvtph_ps(x_vec);
_mm_storeu_ps(y + i, y_vec);
}
-#elif defined(__NNPA__)
- for (; i + 7 < n; i += 8) {
- uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)(x + i));
- uint16x8_t v_yd = vec_convert_from_fp16(v_x, 0);
- float32x4_t v_yh = vec_extend_to_fp32_hi(v_yd, 0);
- float32x4_t v_yl = vec_extend_to_fp32_lo(v_yd, 0);
- vec_xst(v_yh, 0, (float *)(y + i + 0));
- vec_xst(v_yl, 0, (float *)(y + i + 4));
- }
- for (; i + 3 < n; i += 4) {
- uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)(x + i));
- uint16x8_t v_yd = vec_convert_from_fp16(v_x, 0);
- float32x4_t v_yh = vec_extend_to_fp32_hi(v_yd, 0);
- vec_xst(v_yh, 0, (float *)(y + i));
- }
#endif
    for (; i < n; ++i) {
        y[i] = GGML_CPU_FP16_TO_FP32(x[i]);
    }
}
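With the NNPA paths above deleted, s390x no longer has a vector fast path here; those elements now fall through to the scalar tail loop, converting one value per iteration via GGML_CPU_FP16_TO_FP32. For reference, a self-contained sketch of what such a scalar IEEE binary16-to-binary32 conversion involves (hypothetical helper, not the exact ggml implementation):

    #include <stdint.h>
    #include <string.h>

    // Hypothetical scalar converter: expand IEEE binary16 bits to binary32.
    static float fp16_to_fp32_scalar(uint16_t h) {
        const uint32_t sign = (uint32_t)(h & 0x8000u) << 16;
        uint32_t exp  = (h >> 10) & 0x1Fu;
        uint32_t mant =  h        & 0x3FFu;
        uint32_t bits;

        if (exp == 0x1Fu) {                    // inf / NaN
            bits = sign | 0x7F800000u | (mant << 13);
        } else if (exp != 0) {                 // normal: rebias 15 -> 127
            bits = sign | ((exp + 112u) << 23) | (mant << 13);
        } else if (mant == 0) {                // signed zero
            bits = sign;
        } else {                               // subnormal: renormalize
            int shift = 0;
            while ((mant & 0x400u) == 0) { mant <<= 1; shift++; }
            mant &= 0x3FFu;
            bits = sign | ((uint32_t)(113 - shift) << 23) | (mant << 13);
        }

        float f;
        memcpy(&f, &bits, sizeof(f));
        return f;
    }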
-int ggml_cpu_has_nnpa(void) {
-#if defined(GGML_NNPA)
- return 1;
-#else
- return 0;
-#endif
-}
-
int ggml_cpu_has_neon(void) {
#if defined(__ARM_ARCH) && defined(__ARM_NEON)
return 1;
if (ggml_cpu_has_vxe()) {
features.push_back({ "VXE", "1" });
}
- if (ggml_cpu_has_nnpa()) {
- features.push_back({ "NNPA", "1" });
- }
if (ggml_cpu_has_wasm_simd()) {
features.push_back({ "WASM_SIMD", "1" });
}
#define GGML_CPU_COMPUTE_FP32_TO_FP16(x) riscv_compute_fp32_to_fp16(x)
#define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
#define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x)
-#elif defined(__NNPA__)
- #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) nnpa_compute_fp16_to_fp32(x)
- #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) nnpa_compute_fp32_to_fp16(x)
-
- #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
- #define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x)
-
- static inline float nnpa_compute_fp16_to_fp32(ggml_fp16_t h) {
- uint16x8_t v_h = vec_splats(h);
- uint16x8_t v_hd = vec_convert_from_fp16(v_h, 0);
- return vec_extend_to_fp32_hi(v_hd, 0)[0];
- }
-
- static inline ggml_fp16_t nnpa_compute_fp32_to_fp16(float f) {
- float32x4_t v_f = vec_splats(f);
- float32x4_t v_zero = vec_splats(0.0f);
- uint16x8_t v_hd = vec_round_from_fp32(v_f, v_zero, 0);
- uint16x8_t v_h = vec_convert_to_fp16(v_hd, 0);
- return vec_extract(v_h, 0);
- }
#endif
// precomputed f32 table for f16 (256 KB)
#define GGML_F16_EPR GGML_F32_EPR
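The "256 KB" comment above refers to the table strategy: there are only 65536 possible f16 bit patterns, so every conversion can be precomputed once and reduced to an indexed load. A minimal sketch of that idea (names are hypothetical; it reuses fp16_to_fp32_scalar from the earlier sketch):

    #include <stdint.h>

    // 65536 floats = 256 KB, one slot per possible f16 bit pattern.
    static float f16_to_f32_table[1 << 16];

    static void f16_table_init(void) {
        for (uint32_t h = 0; h < (1u << 16); ++h) {
            f16_to_f32_table[h] = fp16_to_fp32_scalar((uint16_t)h);
        }
    }

    static inline float f16_lookup(uint16_t h) {
        return f16_to_f32_table[h]; // conversion becomes a single load
    }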
static inline float32x4_t __lzs_f16cx4_load(const ggml_fp16_t * x) {
-#if defined(__NNPA__)
- uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)x);
- uint16x8_t v_xd = vec_convert_from_fp16(v_x, 0);
- return vec_extend_to_fp32_hi(v_xd, 0);
-#else
    float tmp[4];

    for (int i = 0; i < 4; i++) {
        tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]);
    }

    // note: keep type-cast here to prevent compiler bugs
    // see: https://github.com/ggml-org/llama.cpp/issues/12846
    return vec_xl(0, (const float *)(tmp));
-#endif
}
static inline void __lzs_f16cx4_store(ggml_fp16_t * x, float32x4_t v_y) {
-#if defined(__NNPA__)
- float32x4_t v_zero = vec_splats(0.0f);
- uint16x8_t v_xd = vec_round_from_fp32(v_y, v_zero, 0);
- uint16x8_t v_x = vec_convert_to_fp16(v_xd, 0);
-
- x[0] = vec_extract(v_x, 0);
- x[1] = vec_extract(v_x, 1);
- x[2] = vec_extract(v_x, 2);
- x[3] = vec_extract(v_x, 3);
-#else
    float arr[4];

    // note: keep type-cast here to prevent compiler bugs
    // see: https://github.com/ggml-org/llama.cpp/issues/12846
    vec_xst(v_y, 0, (float *)(arr));

    for (int i = 0; i < 4; i++) {
        x[i] = GGML_CPU_FP32_TO_FP16(arr[i]);
    }
-#endif
}
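After this change both helpers take the single generic path on all s390x machines: widen four halves element-wise on load, narrow element-wise on store. A hedged round-trip sketch (assumes vecintrin.h and ggml's float32x4_t typedef for __vector float):

    // Round-trip four f16 values through the generic helpers.
    ggml_fp16_t h[4] = { 0 };                 // some f16 data
    float32x4_t v = __lzs_f16cx4_load(h);     // f16 -> f32 lanes
    v = vec_madd(v, v, vec_splats(0.0f));     // f32 math on the lanes (v*v)
    __lzs_f16cx4_store(h, v);                 // f32 lanes -> f16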
#define GGML_F16_VEC GGML_F32x4