cmake --build build --config Release -j $(nproc)
```
-- By default, NNPA is disabled by default. To enable it:
-
- ```bash
- cmake -S . -B build \
- -DCMAKE_BUILD_TYPE=Release \
- -DGGML_BLAS=ON \
- -DGGML_BLAS_VENDOR=OpenBLAS \
- -DGGML_NNPA=ON
-
- cmake --build build --config Release -j $(nproc)
- ```
-
- For debug builds:
```bash
Only available on IBM z15/LinuxONE 3 or later systems with the `-DGGML_VXE=ON` (turned on by default) compile flag. No hardware acceleration is possible with llama.cpp on older systems, such as IBM z14/arch12. On such systems, the APIs can still run but will use a scalar implementation.
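For reference, a minimal configure sketch that sets the flag explicitly (VXE is already on by default, so this only makes the choice visible):

```bash
# illustrative only: GGML_VXE defaults to ON, passed here explicitly
cmake -S . -B build \
    -DCMAKE_BUILD_TYPE=Release \
    -DGGML_VXE=ON

cmake --build build --config Release -j $(nproc)
```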
-### 2. NNPA Vector Intrinsics Acceleration
-
-Only available in IBM z16/LinuxONE 4 or later system with the `-DGGML_NNPA=ON` (turned off by default) compile flag. No hardware acceleration is possible with llama.cpp with older systems, such as IBM z15/arch13. In such systems, the APIs can still run but will use a scalar implementation.
-
-### 3. zDNN Accelerator (WIP)
+### 2. zDNN Accelerator (WIP)
Only available on IBM z17/LinuxONE 5 or later systems with the `-DGGML_ZDNN=ON` compile flag. No hardware acceleration is possible with llama.cpp on older systems, such as IBM z15/arch13. On such systems, the APIs will fall back to CPU routines.
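A minimal configure sketch, assuming an IBM z17/LinuxONE 5 or later machine:

```bash
# illustrative only: requires IBM z17/LinuxONE 5 or later hardware
cmake -S . -B build \
    -DCMAKE_BUILD_TYPE=Release \
    -DGGML_ZDNN=ON

cmake --build build --config Release -j $(nproc)
```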
-### 4. Spyre Accelerator
+### 3. Spyre Accelerator
_Only available with IBM z17 / LinuxONE 5 or later systems. No support currently available._
CXXFLAGS="-include cstdint" pip3 install -r requirements.txt
```
-5. `-DGGML_NNPA=ON` generates gibberish output
-
- Answer: We are aware of this as detailed in [this issue](https://github.com/ggml-org/llama.cpp/issues/14877). Please either try reducing the number of threads, or disable the compile option using `-DGGML_NNPA=OFF`.
-
## Getting Help on IBM Z & LinuxONE
1. **Bugs, Feature Requests**
## Appendix B: SIMD Support Matrix
-| | VX/VXE/VXE2 | NNPA | zDNN | Spyre |
-| ---------- | ----------- | ---- | ---- | ----- |
-| FP32 | ✅ | ✅ | ✅ | ❓ |
-| FP16 | ✅ | ✅ | ❓ | ❓ |
-| BF16 | 🚫 | 🚫 | ❓ | ❓ |
-| Q4_0 | ✅ | ✅ | ❓ | ❓ |
-| Q4_1 | ✅ | ✅ | ❓ | ❓ |
-| MXFP4 | 🚫 | 🚫 | ❓ | ❓ |
-| Q5_0 | ✅ | ✅ | ❓ | ❓ |
-| Q5_1 | ✅ | ✅ | ❓ | ❓ |
-| Q8_0 | ✅ | ✅ | ❓ | ❓ |
-| Q2_K | 🚫 | 🚫 | ❓ | ❓ |
-| Q3_K | ✅ | ✅ | ❓ | ❓ |
-| Q4_K | ✅ | ✅ | ❓ | ❓ |
-| Q5_K | ✅ | ✅ | ❓ | ❓ |
-| Q6_K | ✅ | ✅ | ❓ | ❓ |
-| TQ1_0 | 🚫 | 🚫 | ❓ | ❓ |
-| TQ2_0 | 🚫 | 🚫 | ❓ | ❓ |
-| IQ2_XXS | 🚫 | 🚫 | ❓ | ❓ |
-| IQ2_XS | 🚫 | 🚫 | ❓ | ❓ |
-| IQ2_S | 🚫 | 🚫 | ❓ | ❓ |
-| IQ3_XXS | 🚫 | 🚫 | ❓ | ❓ |
-| IQ3_S | 🚫 | 🚫 | ❓ | ❓ |
-| IQ1_S | 🚫 | 🚫 | ❓ | ❓ |
-| IQ1_M | 🚫 | 🚫 | ❓ | ❓ |
-| IQ4_NL | ✅ | ✅ | ❓ | ❓ |
-| IQ4_XS | ✅ | ✅ | ❓ | ❓ |
-| FP32->FP16 | 🚫 | ✅ | ❓ | ❓ |
-| FP16->FP32 | 🚫 | ✅ | ❓ | ❓ |
+| | VX/VXE/VXE2 | zDNN | Spyre |
+|------------|-------------|------|-------|
+| FP32 | ✅ | ✅ | ❓ |
+| FP16 | ✅ | ❓ | ❓ |
+| BF16 | 🚫 | ❓ | ❓ |
+| Q4_0 | ✅ | ❓ | ❓ |
+| Q4_1 | ✅ | ❓ | ❓ |
+| MXFP4 | 🚫 | ❓ | ❓ |
+| Q5_0 | ✅ | ❓ | ❓ |
+| Q5_1 | ✅ | ❓ | ❓ |
+| Q8_0 | ✅ | ❓ | ❓ |
+| Q2_K | 🚫 | ❓ | ❓ |
+| Q3_K | ✅ | ❓ | ❓ |
+| Q4_K | ✅ | ❓ | ❓ |
+| Q5_K | ✅ | ❓ | ❓ |
+| Q6_K | ✅ | ❓ | ❓ |
+| TQ1_0 | 🚫 | ❓ | ❓ |
+| TQ2_0 | 🚫 | ❓ | ❓ |
+| IQ2_XXS | 🚫 | ❓ | ❓ |
+| IQ2_XS | 🚫 | ❓ | ❓ |
+| IQ2_S | 🚫 | ❓ | ❓ |
+| IQ3_XXS | 🚫 | ❓ | ❓ |
+| IQ3_S | 🚫 | ❓ | ❓ |
+| IQ1_S | 🚫 | ❓ | ❓ |
+| IQ1_M | 🚫 | ❓ | ❓ |
+| IQ4_NL | ✅ | ❓ | ❓ |
+| IQ4_XS | ✅ | ❓ | ❓ |
+| FP32->FP16 | 🚫 | ❓ | ❓ |
+| FP16->FP32 | 🚫 | ❓ | ❓ |
- ✅ - acceleration available
- 🚫 - acceleration unavailable, will still run using scalar implementation
- ❓ - acceleration unknown, please contribute if you can test it yourself
-Last Updated by **Aaron Teo (aaron.teo1@ibm.com)** on Aug 22, 2025.
+Last Updated by **Aaron Teo (aaron.teo1@ibm.com)** on Sep 6, 2025.
option(GGML_RV_ZICBOP "ggml: enable riscv zicbop" ON)
option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF)
option(GGML_VXE "ggml: enable vxe" ON)
-option(GGML_NNPA "ggml: enable nnpa" OFF) # temp disabled by default, see: https://github.com/ggml-org/llama.cpp/issues/14877
option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
GGML_BACKEND_API int ggml_cpu_has_riscv_v (void);
GGML_BACKEND_API int ggml_cpu_has_vsx (void);
GGML_BACKEND_API int ggml_cpu_has_vxe (void);
- GGML_BACKEND_API int ggml_cpu_has_nnpa (void);
GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
GGML_BACKEND_API int ggml_cpu_has_llamafile (void);
# TODO: Separation to determine activation of VX/VXE/VXE2
if (${S390X_M} MATCHES "8561|8562")
- set(GGML_NNPA OFF)
message(STATUS "z15 target")
list(APPEND ARCH_FLAGS -march=z15)
elseif (${S390X_M} MATCHES "3931")
list(APPEND ARCH_FLAGS -mvx -mzvector)
list(APPEND ARCH_DEFINITIONS GGML_VXE)
endif()
-
- if (GGML_NNPA)
- message(STATUS "NNPA enabled")
- list(APPEND ARCH_DEFINITIONS GGML_NNPA)
- endif()
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "wasm")
message(STATUS "Wasm detected")
list (APPEND GGML_CPU_SOURCES ggml-cpu/arch/wasm/quants.c)
#endif // __VXE2__
#endif // __s390x__ && __VEC__
-#if defined(__s390x__) && defined(GGML_NNPA)
-#ifndef __NNPA__
-#define __NNPA__
-#endif // __NNPA__
-#endif // __s390x__ && GGML_NNPA
-
#if defined(__ARM_FEATURE_SVE)
#include <sys/prctl.h>
#endif
__m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
_mm_storel_epi64((__m128i *)(y + i), y_vec);
}
-#elif defined(__NNPA__)
- for (; i + 7 < n; i += 8) {
- float32x4_t v_xh = vec_xl(0, (const float *)(x + i + 0));
- float32x4_t v_xl = vec_xl(0, (const float *)(x + i + 4));
- uint16x8_t v_yd = vec_round_from_fp32(v_xh, v_xl, 0);
- uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0);
- vec_xst(v_y, 0, (ggml_fp16_t *)(y + i));
- }
- for (; i + 3 < n; i += 4) {
- float32x4_t v_x = vec_xl(0, (const float *)(x + i));
- float32x4_t v_zero = vec_splats(0.0f);
- uint16x8_t v_yd = vec_round_from_fp32(v_x, v_zero, 0);
- uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0);
- vec_xst(v_y, 0, (ggml_fp16_t *)(y + i));
- }
#elif defined(__riscv_zvfh)
for (int vl; i < n; i += vl) {
vl = __riscv_vsetvl_e32m2(n - i);
__m128 y_vec = _mm_cvtph_ps(x_vec);
_mm_storeu_ps(y + i, y_vec);
}
-#elif defined(__NNPA__)
- for (; i + 7 < n; i += 8) {
- uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)(x + i));
- uint16x8_t v_yd = vec_convert_from_fp16(v_x, 0);
- float32x4_t v_yh = vec_extend_to_fp32_hi(v_yd, 0);
- float32x4_t v_yl = vec_extend_to_fp32_lo(v_yd, 0);
- vec_xst(v_yh, 0, (float *)(y + i + 0));
- vec_xst(v_yl, 0, (float *)(y + i + 4));
- }
- for (; i + 3 < n; i += 4) {
- uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)(x + i));
- uint16x8_t v_yd = vec_convert_from_fp16(v_x, 0);
- float32x4_t v_yh = vec_extend_to_fp32_hi(v_yd, 0);
- vec_xst(v_yh, 0, (float *)(y + i));
- }
#endif
for (; i < n; ++i) {
#endif
}
-int ggml_cpu_has_nnpa(void) {
-#if defined(GGML_NNPA)
- return 1;
-#else
- return 0;
-#endif
-}
-
int ggml_cpu_has_neon(void) {
#if defined(__ARM_ARCH) && defined(__ARM_NEON)
return 1;
if (ggml_cpu_has_vxe()) {
features.push_back({ "VXE", "1" });
}
- if (ggml_cpu_has_nnpa()) {
- features.push_back({ "NNPA", "1" });
- }
if (ggml_cpu_has_wasm_simd()) {
features.push_back({ "WASM_SIMD", "1" });
}
#define GGML_CPU_COMPUTE_FP32_TO_FP16(x) riscv_compute_fp32_to_fp16(x)
#define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
#define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x)
-#elif defined(__NNPA__)
- #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) nnpa_compute_fp16_to_fp32(x)
- #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) nnpa_compute_fp32_to_fp16(x)
-
- #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
- #define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x)
-
- static inline float nnpa_compute_fp16_to_fp32(ggml_fp16_t h) {
- uint16x8_t v_h = vec_splats(h);
- uint16x8_t v_hd = vec_convert_from_fp16(v_h, 0);
- return vec_extend_to_fp32_hi(v_hd, 0)[0];
- }
-
- static inline ggml_fp16_t nnpa_compute_fp32_to_fp16(float f) {
- float32x4_t v_f = vec_splats(f);
- float32x4_t v_zero = vec_splats(0.0f);
- uint16x8_t v_hd = vec_round_from_fp32(v_f, v_zero, 0);
- uint16x8_t v_h = vec_convert_to_fp16(v_hd, 0);
- return vec_extract(v_h, 0);
- }
#endif
// precomputed f32 table for f16 (256 KB)
#define GGML_F16_EPR GGML_F32_EPR
static inline float32x4_t __lzs_f16cx4_load(const ggml_fp16_t * x) {
-#if defined(__NNPA__)
- uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)x);
- uint16x8_t v_xd = vec_convert_from_fp16(v_x, 0);
- return vec_extend_to_fp32_hi(v_xd, 0);
-#else
float tmp[4];
for (int i = 0; i < 4; i++) {
// note: keep type-cast here to prevent compiler bugs
// see: https://github.com/ggml-org/llama.cpp/issues/12846
return vec_xl(0, (const float *)(tmp));
-#endif
}
static inline void __lzs_f16cx4_store(ggml_fp16_t * x, float32x4_t v_y) {
-#if defined(__NNPA__)
- float32x4_t v_zero = vec_splats(0.0f);
- uint16x8_t v_xd = vec_round_from_fp32(v_y, v_zero, 0);
- uint16x8_t v_x = vec_convert_to_fp16(v_xd, 0);
-
- x[0] = vec_extract(v_x, 0);
- x[1] = vec_extract(v_x, 1);
- x[2] = vec_extract(v_x, 2);
- x[3] = vec_extract(v_x, 3);
-#else
float arr[4];
// note: keep type-cast here to prevent compiler bugs
for (int i = 0; i < 4; i++) {
x[i] = GGML_CPU_FP32_TO_FP16(arr[i]);
}
-#endif
}
#define GGML_F16_VEC GGML_F32x4