ggml-cpu : disable GGML_NNPA by default due to instability (#14880)

author Aaron Teo <redacted>

Fri, 25 Jul 2025 17:09:03 +0000 (01:09 +0800)

committer GitHub <redacted>

Fri, 25 Jul 2025 17:09:03 +0000 (19:09 +0200)
author Aaron Teo <redacted>
Fri, 25 Jul 2025 17:09:03 +0000 (01:09 +0800)
committer GitHub <redacted>
Fri, 25 Jul 2025 17:09:03 +0000 (19:09 +0200)
diff --git a/docs/build-s390x.md b/docs/build-s390x.md

index 4c9ebb271cee248a1cfd54bc65887ad90764b3b6..4d5857753ae68b2254a3aef44a9233eeb5886a06 100644 (file)
--- a/docs/build-s390x.md
+++ b/docs/build-s390x.md
@@ -42,14 +42,14 @@ cmake --build build --config Release -j $(nproc)
      cmake --build build --config Release -j $(nproc)
      ```
  
--   By default, NNPA is enabled when available. To disable it (not recommended):
+-   NNPA is disabled by default. To enable it:
  
      ```bash
      cmake -S . -B build             \
          -DCMAKE_BUILD_TYPE=Release  \
          -DGGML_BLAS=ON              \
          -DGGML_BLAS_VENDOR=OpenBLAS \
-        -DGGML_NNPA=OFF
+        -DGGML_NNPA=ON
  
      cmake --build build --config Release -j $(nproc)
      ```
@@ -84,9 +84,9 @@ All models need to be converted to Big-Endian. You can achieve this in three cas
  
      ![File Type - gguf](https://img.shields.io/badge/File_Type-gguf-fff)
  
-    You can find popular models pre-converted and verified at [s390x Ready Models](https://huggingface.co/collections/taronaeo/s390x-ready-models-672765393af438d0ccb72a08).
+    You can find popular models pre-converted and verified at [s390x Verified Models](https://huggingface.co/collections/taronaeo/s390x-verified-models-672765393af438d0ccb72a08) or [s390x Runnable Models](https://huggingface.co/collections/taronaeo/s390x-runnable-models-686e951824198df12416017e).
  
-    These models have already been converted from `safetensors` to `GGUF Big-Endian` and their respective tokenizers verified to run correctly on IBM z15 and later system.
+    These models have already been converted from `safetensors` to `GGUF` Big-Endian and their respective tokenizers verified to run correctly on IBM z15 and later systems.
  
  2. **Convert safetensors model to GGUF Big-Endian directly (recommended)**
  
@@ -94,6 +94,14 @@ All models need to be converted to Big-Endian. You can achieve this in three cas
  
      The model you are trying to convert must be in `safetensors` file format (for example [IBM Granite 3.3 2B](https://huggingface.co/ibm-granite/granite-3.3-2b-instruct)). Make sure you have downloaded the model repository for this case.
  
+    Ensure that you have installed the required packages in advance:
+
+    ```bash
+    pip3 install -r requirements.txt
+    ```
+
+    Convert the `safetensors` model to `GGUF`
+
      ```bash
      python3 convert_hf_to_gguf.py \
          --outfile model-name-be.f16.gguf \
@@ -116,7 +124,7 @@ All models need to be converted to Big-Endian. You can achieve this in three cas
  
      ![File Type - gguf](https://img.shields.io/badge/File_Type-gguf-fff)
  
-    The model you are trying to convert must be in `gguf` file format (for example [IBM Granite 3.3 2B](https://huggingface.co/ibm-granite/granite-3.3-2b-instruct-GGUF)). Make sure you have downloaded the model file for this case.
+    The model you are trying to convert must be in `gguf` file format (for example [IBM Granite 3.3 2B GGUF](https://huggingface.co/ibm-granite/granite-3.3-2b-instruct-GGUF)). Make sure you have downloaded the model file for this case.
  
      ```bash
      python3 gguf-py/gguf/scripts/gguf_convert_endian.py model-name.f16.gguf BIG
@@ -141,15 +149,15 @@ Only available in IBM z15 or later system with the `-DGGML_VXE=ON` (turned on by
  
  ### 2. NNPA Vector Intrinsics Acceleration
  
-Only available in IBM z16 or later system with the `-DGGML_NNPA=ON` (turned on when available) compile flag. No hardware acceleration is possible with llama.cpp with older systems, such as IBM z15/arch13. In such systems, the APIs can still run but will use a scalar implementation.
+Only available on IBM z16 or later systems with the `-DGGML_NNPA=ON` compile flag (turned off by default). No hardware acceleration is possible with llama.cpp on older systems, such as IBM z15/arch13. On such systems, the APIs can still run but will use a scalar implementation.
  
  ### 3. zDNN Accelerator
  
-_Only available in IBM z16 or later system. No direction at the moment._
+_Only available in IBM z16 / LinuxONE 4 or later system. No support currently available._
  
  ### 4. Spyre Accelerator
  
-_No direction at the moment._
+_Only available with IBM z17 / LinuxONE 5 or later system. No support currently available._
  
  ## Performance Tuning
  
@@ -189,6 +197,26 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl
  
      Answer: Please ensure that your GCC compiler is of minimum GCC 15.1.0 version, and have `binutils` updated to the latest version. If this does not fix the problem, kindly open an issue.
  
+4. Failing to install the `sentencepiece` package using GCC 15+
+
+    Answer: The `sentencepiece` team are aware of this as seen in [this issue](https://github.com/google/sentencepiece/issues/1108).
+
+    As a temporary workaround, please run the installation command with the following environment variables.
+
+    ```bash
+    export CXXFLAGS="-include cstdint"
+    ```
+
+    For example,
+
+    ```bash
+    CXXFLAGS="-include cstdint" pip3 install -r requirements.txt
+    ```
+
+5. `-DGGML_NNPA=ON` generates gibberish output
+
+    Answer: We are aware of this as detailed in [this issue](https://github.com/ggml-org/llama.cpp/issues/14877). Please either try reducing the number of threads, or disable the compile option using `-DGGML_NNPA=OFF`.
+
  ## Getting Help on IBM Z & LinuxONE
  
  1. **Bugs, Feature Requests**
@@ -244,3 +272,5 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl
  -   ✅ - acceleration available
  -   🚫 - acceleration unavailable, will still run using scalar implementation
  -   ❓ - acceleration unknown, please contribute if you can test it yourself
+
+Last Updated by **Aaron Teo (aaron.teo1@ibm.com)** on July 25, 2025.
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt

index 8ca1053cab3205c02b9f0038fa4f941135eef251..20467c54da1020bedc8c5d6e685fd056eac757fe 100644 (file)
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -131,7 +131,7 @@ option(GGML_RVV              "ggml: enable rvv"              ON)
  option(GGML_RV_ZFH           "ggml: enable riscv zfh"        OFF)
  option(GGML_XTHEADVECTOR     "ggml: enable xtheadvector"     OFF)
  option(GGML_VXE              "ggml: enable vxe"              ON)
-option(GGML_NNPA             "ggml: enable nnpa"             ON)
+option(GGML_NNPA             "ggml: enable nnpa"             OFF)  # temp disabled by default, see: https://github.com/ggml-org/llama.cpp/issues/14877
  
  option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
  set(GGML_CPU_ARM_ARCH        "" CACHE STRING "ggml: CPU architecture for ARM")
diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt

index 2cc42d4b02af95511144e5e220317a3e529fd844..f188d1638dc5ddb8d804c0f54a14bbee09a96859 100644 (file)
--- a/ggml/src/ggml-cpu/CMakeLists.txt
+++ b/ggml/src/ggml-cpu/CMakeLists.txt
@@ -458,6 +458,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
              list(APPEND ARCH_FLAGS -march=z16)
          elseif (${S390X_M} MATCHES "9175|9176")
              # NOTE: Only available from GCC 15.1.0 onwards. Any z17 machine with compile issues must first verify their GCC version.
+            #       binutils must also be updated to the latest version for the -march=z17 flag to work. Otherwise, use -march=arch15.
              message(STATUS "z17 target")
              list(APPEND ARCH_FLAGS -march=z17)
          else()
author	Aaron Teo <redacted>
	Fri, 25 Jul 2025 17:09:03 +0000 (01:09 +0800)
committer	GitHub <redacted>
	Fri, 25 Jul 2025 17:09:03 +0000 (19:09 +0200)
docs/build-s390x.md		patch \| blob \| history
ggml/CMakeLists.txt		patch \| blob \| history
ggml/src/ggml-cpu/CMakeLists.txt		patch \| blob \| history