id: update_presets
if: ${{ matrix.build == 'arm64-snapdragon' }}
run: |
- cp docs/backend/hexagon/CMakeUserPresets.json .
+ cp docs/backend/snapdragon/CMakeUserPresets.json .
- name: Build
id: ndk_build
+++ /dev/null
-{
- "version": 4,
- "configurePresets": [
- {
- "name": "arm64-android-snapdragon",
- "hidden": true,
- "architecture": { "value": "arm64", "strategy": "external" },
- "toolset": { "value": "host=x86_64", "strategy": "external" },
- "cacheVariables": {
- "ANDROID_ABI": "arm64-v8a",
- "ANDROID_PLATFORM": "android-31",
- "CMAKE_TOOLCHAIN_FILE": "$env{ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake",
- "CMAKE_C_FLAGS": "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE",
- "CMAKE_CXX_FLAGS": "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE",
- "CMAKE_C_FLAGS_RELEASE": "-O3 -DNDEBUG",
- "CMAKE_CXX_FLAGS_RELEASE": "-O3 -DNDEBUG",
- "CMAKE_C_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g",
- "CMAKE_CXX_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g",
- "HEXAGON_SDK_ROOT": "$env{HEXAGON_SDK_ROOT}",
- "PREBUILT_LIB_DIR": "android_aarch64",
- "GGML_OPENMP": "OFF",
- "GGML_LLAMAFILE": "OFF",
- "GGML_OPENCL": "ON",
- "GGML_HEXAGON": "ON",
- "GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE": "128",
- "LLAMA_OPENSSL": "OFF"
- }
- },
-
- {
- "name": "arm64-windows-snapdragon",
- "inherits": [ "base", "arm64-windows-llvm" ],
- "cacheVariables": {
- "HEXAGON_SDK_ROOT": "$env{HEXAGON_SDK_ROOT}",
- "PREBUILT_LIB_DIR": "windows_aarch64",
- "GGML_OPENMP": "OFF",
- "GGML_LLAMAFILE": "OFF",
- "GGML_OPENCL": "ON",
- "GGML_HEXAGON": "ON",
- "GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE": "128",
- "LLAMA_OPENSSL": "OFF"
- }
- },
-
- { "name": "arm64-android-snapdragon-debug" , "inherits": [ "base", "arm64-android-snapdragon", "debug" ] },
- { "name": "arm64-android-snapdragon-release", "inherits": [ "base", "arm64-android-snapdragon", "release" ] },
-
- { "name": "arm64-windows-snapdragon-debug" , "inherits": [ "base", "arm64-windows-snapdragon", "debug" ] },
- { "name": "arm64-windows-snapdragon-release", "inherits": [ "base", "arm64-windows-snapdragon", "release" ] }
- ]
-}
+++ /dev/null
-# Snapdragon-based Android devices
-
-## How to Build
-
-The easiest way to build llama.cpp for a Snapdragon-based Android device is using the toolchain Docker image (see github.com/snapdragon-toolchain).
-This image includes Android NDK, OpenCL SDK, Hexagon SDK, CMake, etc.
-
-This method works on Linux, macOS, and Windows. macOS and Windows users should install Docker Desktop.
-
-```
-~/src/llama.cpp$ docker run -it -u $(id -u):$(id -g) --volume $(pwd):/workspace --platform linux/amd64 ghcr.io/snapdragon-toolchain/arm64-android:v0.3
-[d]/> cd /workspace
-```
-
-The rest of the Android build process assumes that you're running inside the toolchain container.
-Let's build llama.cpp with CPU, OpenCL, and Hexagon backends via CMake presets:
-
-```
-[d]/workspace> cp docs/backend/hexagon/CMakeUserPresets.json .
-
-[d]/workspace> cmake --preset arm64-android-snapdragon-release -B build-snapdragon
-Preset CMake variables:
- ANDROID_ABI="arm64-v8a"
- ...
- CMAKE_TOOLCHAIN_FILE="/opt/android-ndk-r28b/build/cmake/android.toolchain.cmake"
- GGML_HEXAGON="ON"
- GGML_OPENCL="ON"
- GGML_OPENMP="OFF"
- HEXAGON_SDK_ROOT="/opt/hexagon/6.4.0.2"
-...
--- Including OpenCL backend
--- Including Hexagon backend
-...
--- Build files have been written to: /workspace/build-snapdragon
-
-[d]/workspace> cmake --build build-snapdragon
-...
-[144/356] Performing build step for 'htp-v73'
-[1/16] Generating htp_iface_skel.c, htp_iface_stub.c, htp_iface.h
-[2/16] Building C object CMakeFiles/ggml-htp-v73.dir/hvx-sigmoid.c.obj
-[3/16] Building C object CMakeFiles/ggml-htp-v73.dir/htp-dma.c.obj
-[4/16] Building C object CMakeFiles/ggml-htp-v73.dir/worker-pool.c.obj
-...
--- Installing: /workspace/build-snapdragon/ggml/src/ggml-hexagon/libggml-htp-v73.so
--- Installing: /workspace/build-snapdragon/ggml/src/ggml-hexagon/libggml-htp-v75.so
-...
-```
-
-To generate an installable "package" simply use cmake --install:
-
-```
-[d]/workspace> cmake --install build-snapdragon --prefix pkg-adb/llama.cpp
--- Install configuration: "Release"
--- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-cpu.so
--- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-opencl.so
--- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-hexagon.so
--- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-htp-v73.so
--- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-htp-v75.so
--- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-htp-v79.so
--- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-htp-v81.so
--- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml.so
-...
--- Installing: /workspace/pkg-adb/llama.cpp/bin/llama-bench
--- Installing: /workspace/pkg-adb/llama.cpp/bin/llama-cli
-...
-```
-
-## How to Install
-
-For this step, your device needs to be configured for on-device development.
-Please see https://developer.android.com/studio/debug/dev-options for details.
-
-Once ADB is enabled, use `adb push` to install `pkg-snapdragon` on the device.
-**Note that the toolchain Docker image doesn't have ADB and doesn't set up the ADB bridge. Please use native ADB on the host.**
-
-```
-~/src/llama.cpp$ adb push pkg-adb/llama.cpp /data/local/tmp/
-pkg-adb/llama.cpp/bin/: 67 files pushed, 0 skipped. 190.2 MB/s (919095042 bytes in 4.607s)
-pkg-adb/llama.cpp/include/: 19 files pushed, 0 skipped. 20.5 MB/s (255173 bytes in 0.012s)
-pkg-adb/llama.cpp/lib/: 16 files pushed, 0 skipped. 144.4 MB/s (43801382 bytes in 0.289s)
-102 files pushed, 0 skipped. 186.9 MB/s (963151597 bytes in 4.914s)
-```
-
-At this point, you should also install some models:
-
-```
-~/src/llama.cpp$ wget https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf
-...
-2025-10-11 12:04:52 (10.7 MB/s) - ‘Llama-3.2-1B-Instruct-Q4_0.gguf’ saved [773025920/773025920]
-
-~/src/llama.cpp$ adb push Llama-3.2-1B-Instruct-Q4_0.gguf /data/local/tmp/gguf
-Llama-3.2-1B-Instruct-Q4_0.gguf: 1 file pushed, 0 skipped. 38.3 MB/s (773025920 bytes in 19.250s)
-```
-
-## How to Run
-
-The easiest way to run llama.cpp cli tools is using provided wrapper scripts that properly set up all required environment variables.
-
-llama.cpp supports three backends on Snapdragon-based devices: CPU, Adreno GPU (GPUOpenCL), and Hexagon NPU (HTP0-4).
-You can select which backend to run the model on using the `D=` variable, which maps to the `--device` option.
-
-Hexagon NPU behaves as a "GPU" device when it comes to `-ngl` and other offload-related options.
-
-Here are some examples of running various llama.cpp tools via ADB.
-
-Simple question for Llama-3.2-1B
-
-```
-~/src/llama.cpp$ M=Llama-3.2-1B-Instruct-Q4_0.gguf D=HTP0 ./scripts/snapdragon/adb/run-completion.sh -p "what is the most popular cookie in the world?"
-...
-ggml-hex: Hexagon backend (experimental) : allocating new registry : ndev 1
-ggml-hex: Hexagon Arch version v79
-ggml-hex: allocating new session: HTP0
-ggml-hex: new session: HTP0 : session-id 0 domain-id 3 uri file:///libggml-htp-v79.so?htp_iface_skel_handle_invoke&_modver=1.0&_dom=cdsp&_session=0 handle 0xb4000072c7955e50
-...
-load_tensors: offloading output layer to GPU
-load_tensors: offloaded 17/17 layers to GPU
-load_tensors: CPU model buffer size = 225.49 MiB
-load_tensors: HTP0 model buffer size = 0.26 MiB
-load_tensors: HTP0-REPACK model buffer size = 504.00 MiB
-...
-I hope this helps you understand the world's most popular cookies! [end of text]
-...
-llama_perf_sampler_print: sampling time = 30.08 ms / 487 runs ( 0.06 ms per token, 16191.77 tokens per second)
-llama_perf_context_print: load time = 617.94 ms
-llama_perf_context_print: prompt eval time = 80.76 ms / 11 tokens ( 7.34 ms per token, 136.21 tokens per second)
-llama_perf_context_print: eval time = 9210.59 ms / 475 runs ( 19.39 ms per token, 51.57 tokens per second)
-llama_perf_context_print: total time = 9454.92 ms / 486 tokens
-llama_perf_context_print: graphs reused = 473
-llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
-llama_memory_breakdown_print: | - HTP0 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 |
-llama_memory_breakdown_print: | - Host | 439 = 225 + 136 + 77 |
-llama_memory_breakdown_print: | - HTP0-REPACK | 504 = 504 + 0 + 0 |
-```
-
-Summary request for OLMoE-1B-7B. This is a large model that requires two HTP sessions/devices
-
-```
-~/src/llama.cpp$ M=OLMoE-1B-7B-0125-Instruct-Q4_0.gguf NDEV=2 D=HTP0,HTP1 ./scripts/snapdragon/adb/run-completion.sh -f surfing.txt
-...
-ggml-hex: Hexagon backend (experimental) : allocating new registry : ndev 1
-ggml-hex: Hexagon Arch version v81
-ggml-hex: allocating new session: HTP0
-ggml-hex: allocating new session: HTP1
-...
-load_tensors: offloading output layer to GPU
-load_tensors: offloaded 17/17 layers to GPU
-load_tensors: CPU model buffer size = 143.86 MiB
-load_tensors: HTP1 model buffer size = 0.23 MiB
-load_tensors: HTP1-REPACK model buffer size = 1575.00 MiB
-load_tensors: HTP0 model buffer size = 0.28 MiB
-load_tensors: HTP0-REPACK model buffer size = 2025.00 MiB
-...
-llama_context: CPU output buffer size = 0.19 MiB
-llama_kv_cache: HTP1 KV buffer size = 238.00 MiB
-llama_kv_cache: HTP0 KV buffer size = 306.00 MiB
-llama_kv_cache: size = 544.00 MiB ( 8192 cells, 16 layers, 1/1 seqs), K (q8_0): 272.00 MiB, V (q8_0): 272.00 MiB
-llama_context: HTP0 compute buffer size = 15.00 MiB
-llama_context: HTP1 compute buffer size = 15.00 MiB
-llama_context: CPU compute buffer size = 24.56 MiB
-...
-llama_perf_context_print: prompt eval time = 1730.57 ms / 212 tokens ( 8.16 ms per token, 122.50 tokens per second)
-llama_perf_context_print: eval time = 5624.75 ms / 257 runs ( 21.89 ms per token, 45.69 tokens per second)
-llama_perf_context_print: total time = 7377.33 ms / 469 tokens
-llama_perf_context_print: graphs reused = 255
-llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
-llama_memory_breakdown_print: | - HTP0 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 |
-llama_memory_breakdown_print: | - HTP1 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 |
-llama_memory_breakdown_print: | - Host | 742 = 144 + 544 + 54 |
-llama_memory_breakdown_print: | - HTP1-REPACK | 1575 = 1575 + 0 + 0 |
-llama_memory_breakdown_print: | - HTP0-REPACK | 2025 = 2025 + 0 + 0 |
-```
-
-Op test for MUL_MAT
-
-```
-~/src/llama.cpp$ HB=0 ./scripts/snapdragon/adb/run-tool.sh test-backend-ops -b HTP0 -o MUL_MAT
-...
-Backend 2/3: HTP0
-Device description: Hexagon
-Device memory: 2048 MB (2048 MB free)
-MUL_MAT(type_a=q4_0,type_b=f32,m=16,n=1,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): OK
-MUL_MAT(type_a=q4_0,type_b=f32,m=16,n=2,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): OK
-MUL_MAT(type_a=q4_0,type_b=f32,m=16,n=3,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): OK
-
-~/src/llama.cpp-hexagon$ M=Llama-3.2-1B-Instruct-Q4_0.gguf ./scripts/snapdragon/adb/run-bench.sh -p 128 -n 64
-...
-ggml-hex: Hexagon backend (experimental) : allocating new registry : ndev 1
-ggml-hex: Hexagon Arch version v79
-ggml-hex: allocating new session: HTP0
-ggml-hex: new session: HTP0 : session-id 0 domain-id 3 uri file:///libggml-htp-v79.so?htp_iface_skel_handle_invoke&_modver=1.0&_dom=cdsp&_session=0 handle 0xb400007d4b231090
-| model | size | params | backend | ngl | threads | n_batch | mmap | test | t/s |
-| ---------------| ---------: | -----: | ---------- | --: | ------: | ------: | ---: | ----: | ------------: |
-| llama 1B Q4_0 | 729.75 MiB | 1.24 B | HTP | 99 | 4 | 128 | 0 | pp128 | 169.42 ± 1.75 |
-| llama 1B Q4_0 | 729.75 MiB | 1.24 B | HTP | 99 | 4 | 128 | 0 | tg64 | 51.54 ± 1.13 |
-
-build: 6a8cf8914 (6733)
-```
-
-## Environment variables
-
-- `GGML_HEXAGON_NDEV=1`
- Controls the number of devices/sessions to allocate. The default is 1.
- Most quantized models under 4B fit into a single session; an 8B model needs two, and a 20B model needs four.
-
-- `GGML_HEXAGON_NHVX=0`
- Controls the number of HVX hardware threads to use. The default is all (actual number varies depending on the hardware version).
-
-- `GGML_HEXAGON_HOSTBUF=1`
- Controls whether the Hexagon backend allocates host buffers. By default, all buffers except for REPACK are host buffers.
- This option is required for testing Ops that require REPACK buffers (MUL_MAT and MUL_MAT_ID).
-
-- `GGML_HEXAGON_EXPERIMENTAL=1`
- Controls whether the Hexagon backend enables experimental features.
- This option is required for enabling/testing experimental Ops (FLASH_ATTN_EXT).
-
-- `GGML_HEXAGON_VERBOSE=1`
- Enables verbose logging of Ops from the backend. Example output:
-
- ```
- ggml-hex: HTP0 graph-compute n_nodes 2
- ggml-hex: HTP0 matmul : blk.27.ffn_up.weight x ffn_norm-27 -> ffn_up-27 : 3072:8192 x 3072:1 -> 8192:1 : q4_0 x f32 -> f32 : HTP0 x HTP0 -> HTP0 : flags 0x1
- ggml-hex: HTP0 matmul : blk.27.ffn_gate.weight x ffn_norm-27 -> ffn_gate-27 : 3072:8192 x 3072:1 -> 8192:1 : q4_0 x f32 -> f32 : HTP0 x HTP0 -> HTP0 : flags 0x3
- ggml-hex: HTP0 graph-compute n_nodes 1
- ggml-hex: HTP0 matmul : blk.27.ffn_down.weight x ffn_gate_par-27 -> ffn_out-27 : 8192:3072 x 8192:1 -> 3072:1 : q4_0 x f32 -> f32 : HTP0 x HTP0 -> HTP0 : flags 0x0
- ggml-hex: HTP0 get-tensor result_output : data 0x7592487000 offset 0 size 513024
- ```
-
-- `GGML_HEXAGON_PROFILE=1`
- Generates a host-side profile for the ggml-hexagon Ops.
-
-- `GGML_HEXAGON_OPMASK=0x0`
- Allows enabling specific stages of the processing pipeline:
-
- - `0x1` Enable Op Queue (i.e., queuing Ops into NPU)
- - `0x2` Enable Dynamic Quantizer (if needed for the Op)
- - `0x4` Enable Op Compute (MUL_MAT, etc.)
-
- Examples:
-
- `GGML_HEXAGON_OPMASK=0x1 llama-completion ...` - Ops are enqueued but NPU-side processing is stubbed out
- `GGML_HEXAGON_OPMASK=0x3 llama-completion ...` - NPU performs dynamic quantization and skips the rest
- `GGML_HEXAGON_OPMASK=0x7 llama-completion ...` - Full queuing and processing of Ops (default)
+++ /dev/null
-# Hexagon backend developer details
-
-## Backend libraries
-
-The Hexagon backend consist of two parts:
-
- - `libggml-hexagon`
- This is the regular CPU-side GGML backend library, either shared or statically linked
-
- - `libggml-htp-vNN`
- This is the NPU-side (HTP stands for Hexagon Tensor Processor) shared library that contains the Op dispatcher and kernels.
- The correct library is selected automatically at runtime based on the HW version.
-
-Here is an example of the build artifacts
-
-```
-~/src/llama.cpp$ ls -l pkg-adb/llama.cpp/lib/libggml*
-pkg-adb/llama.cpp/lib/libggml-base.so
-pkg-adb/llama.cpp/lib/libggml-cpu.so
-pkg-adb/llama.cpp/lib/libggml-hexagon.so <<< CPU library
-pkg-adb/llama.cpp/lib/libggml-htp-v73.so <<< HTP op/kernels for Hexagon v73
-pkg-adb/llama.cpp/lib/libggml-htp-v75.so
-pkg-adb/llama.cpp/lib/libggml-htp-v79.so
-pkg-adb/llama.cpp/lib/libggml-htp-v81.so
-```
-
-## Memory buffers
-
-Hexagon NPU backend takes advantage of the Snapdragon's unified memory model where all buffers are fully accessible by the CPU and GPU.
-The NPU does have a dedicated tightly-coupled memory called VTCM but that memory is used only for intermediate data (e.g. dynamically
-quantized tensors) or temporary data (chunks of the weight tensors fetched via DMA).
-
-Please note that currently the Hexagon backend does not implement SET/GET_ROWS Ops because there is no advantage in offloading those
-to the NPU at this point.
-
-The backend does allocates non-host buffers for the tensors with datatypes that require repacking: Q4_0, Q8_0, MXFP4.
-From the MMU perspective these buffers are still regular buffers (normal access by the CPU) they are marked as non-host simply to force
-the repacking.
-
-## Large model handling
-
-Hexagon NPU session (aka Process Domain (PD) in the Hexagon docs) is limited to a memory mapping of around 3.5GB.
-In llama.cpp/GGML the Hexagon session is mapped to a single GGML backend device (HTP0, HTP1, etc).
-
-In order to map models larger than 3.5GB we need to allocate multiple devices and split the model.
-For this we're taking advantage of the llama.cpp/GGML multi-GPU layer-splitting support.
-Each Hexagon device behaves like a GPU from the offload and model splitting perspective.
-
-Here is an example of running GPT-OSS-20B model on a newer Snapdragon device with 16GB of DDR.
-
-```
-M=gpt-oss-20b-Q4_0.gguf NDEV=4 D=HTP0,HTP1,HTP2,HTP3 P=surfing.txt scripts/snapdragon/adb/run-completion.sh -f surfing.txt -n 32
-...
-LD_LIBRARY_PATH=/data/local/tmp/llama.cpp/lib
-ADSP_LIBRARY_PATH=/data/local/tmp/llama.cpp/lib
-GGML_HEXAGON_NDEV=4 ./bin/llama-cli --no-mmap -m /data/local/tmp/llama.cpp/../gguf/gpt-oss-20b-Q4_0.gguf
- -t 4 --ctx-size 8192 --batch-size 128 -ctk q8_0 -ctv q8_0 -fa on -ngl 99 --device HTP0,HTP1,HTP2,HTP3 -no-cnv -f surfing.txt
-...
-llama_model_loader: - type f32: 289 tensors
-llama_model_loader: - type q4_0: 96 tensors
-llama_model_loader: - type q8_0: 2 tensors
-llama_model_loader: - type mxfp4: 72 tensors
-...
-load_tensors: offloaded 25/25 layers to GPU
-load_tensors: CPU model buffer size = 1182.09 MiB
-load_tensors: HTP1 model buffer size = 6.64 MiB
-load_tensors: HTP1-REPACK model buffer size = 2505.94 MiB
-load_tensors: HTP3 model buffer size = 5.55 MiB
-load_tensors: HTP3-REPACK model buffer size = 2088.28 MiB
-load_tensors: HTP0 model buffer size = 7.75 MiB
-load_tensors: HTP0-REPACK model buffer size = 2923.59 MiB
-load_tensors: HTP2 model buffer size = 6.64 MiB
-load_tensors: HTP2-REPACK model buffer size = 2505.94 MiB
-...
-llama_context: n_ctx_per_seq (8192) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
-llama_context: CPU output buffer size = 0.77 MiB
-llama_kv_cache_iswa: creating non-SWA KV cache, size = 8192 cells
-llama_kv_cache: HTP1 KV buffer size = 25.50 MiB
-llama_kv_cache: HTP3 KV buffer size = 25.50 MiB
-llama_kv_cache: HTP0 KV buffer size = 25.50 MiB
-llama_kv_cache: HTP2 KV buffer size = 25.50 MiB
-llama_kv_cache: size = 102.00 MiB ( 8192 cells, 12 layers, 1/1 seqs), K (q8_0): 51.00 MiB, V (q8_0): 51.00 MiB
-llama_kv_cache_iswa: creating SWA KV cache, size = 256 cells
-llama_kv_cache: HTP1 KV buffer size = 0.80 MiB
-llama_kv_cache: HTP3 KV buffer size = 0.53 MiB
-llama_kv_cache: HTP0 KV buffer size = 1.06 MiB
-llama_kv_cache: HTP2 KV buffer size = 0.80 MiB
-llama_kv_cache: size = 3.19 MiB ( 256 cells, 12 layers, 1/1 seqs), K (q8_0): 1.59 MiB, V (q8_0): 1.59 MiB
-llama_context: HTP0 compute buffer size = 16.06 MiB
-llama_context: HTP1 compute buffer size = 16.06 MiB
-llama_context: HTP2 compute buffer size = 16.06 MiB
-llama_context: HTP3 compute buffer size = 16.06 MiB
-llama_context: CPU compute buffer size = 98.19 MiB
-...
-llama_perf_context_print: prompt eval time = 3843.67 ms / 197 tokens ( 19.51 ms per token, 51.25 tokens per second)
-llama_perf_context_print: eval time = 1686.13 ms / 31 runs ( 54.39 ms per token, 18.39 tokens per second)
-llama_perf_context_print: total time = 6266.30 ms / 228 tokens
-llama_perf_context_print: graphs reused = 30
-llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
-llama_memory_breakdown_print: | - HTP0 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 |
-llama_memory_breakdown_print: | - HTP1 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 |
-llama_memory_breakdown_print: | - HTP2 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 |
-llama_memory_breakdown_print: | - HTP3 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 |
-llama_memory_breakdown_print: | - Host | 1476 = 1208 + 105 + 162 |
-llama_memory_breakdown_print: | - HTP1-REPACK | 2505 = 2505 + 0 + 0 |
-llama_memory_breakdown_print: | - HTP3-REPACK | 2088 = 2088 + 0 + 0 |
-llama_memory_breakdown_print: | - HTP0-REPACK | 2923 = 2923 + 0 + 0 |
-llama_memory_breakdown_print: | - HTP2-REPACK | 2505 = 2505 + 0 + 0 |
-```
--- /dev/null
+{
+ "version": 5,
+ "cmakeMinimumRequired": {
+ "major": 3,
+ "minor": 28,
+ "patch": 0
+ },
+ "configurePresets": [
+ {
+ "name": "arm64-android-snapdragon",
+ "hidden": true,
+ "architecture": { "value": "arm64", "strategy": "external" },
+ "toolset": { "value": "host=x86_64", "strategy": "external" },
+ "cacheVariables": {
+ "ANDROID_ABI": "arm64-v8a",
+ "ANDROID_PLATFORM": "android-31",
+ "CMAKE_TOOLCHAIN_FILE": "$env{ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake",
+ "CMAKE_C_FLAGS": "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE",
+ "CMAKE_CXX_FLAGS": "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE",
+ "CMAKE_C_FLAGS_RELEASE": "-O3 -DNDEBUG",
+ "CMAKE_CXX_FLAGS_RELEASE": "-O3 -DNDEBUG",
+ "CMAKE_C_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g",
+ "CMAKE_CXX_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g",
+ "CMAKE_PREFIX_PATH": "$env{OPENCL_SDK_ROOT}",
+ "HEXAGON_SDK_ROOT": "$env{HEXAGON_SDK_ROOT}",
+ "HEXAGON_TOOLS_ROOT": "$env{HEXAGON_TOOLS_ROOT}",
+ "PREBUILT_LIB_DIR": "android_aarch64",
+ "GGML_OPENMP": "OFF",
+ "GGML_LLAMAFILE": "OFF",
+ "GGML_OPENCL": "ON",
+ "GGML_HEXAGON": "ON",
+ "GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE": "128",
+ "LLAMA_OPENSSL": "OFF"
+ }
+ },
+
+ {
+ "name": "arm64-windows-snapdragon",
+ "inherits": [ "base", "arm64-windows-llvm" ],
+ "cacheVariables": {
+ "CMAKE_C_FLAGS": "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -flto -D_GNU_SOURCE",
+ "CMAKE_CXX_FLAGS": "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -flto -D_GNU_SOURCE",
+ "CMAKE_C_FLAGS_RELEASE": "-O3 -DNDEBUG",
+ "CMAKE_CXX_FLAGS_RELEASE": "-O3 -DNDEBUG",
+ "CMAKE_C_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g",
+ "CMAKE_CXX_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g",
+ "CMAKE_PREFIX_PATH": "$env{OPENCL_SDK_ROOT}",
+ "HEXAGON_SDK_ROOT": "$env{HEXAGON_SDK_ROOT}",
+ "HEXAGON_TOOLS_ROOT": "$env{HEXAGON_TOOLS_ROOT}",
+ "PREBUILT_LIB_DIR": "windows_aarch64",
+ "GGML_OPENMP": "OFF",
+ "GGML_LLAMAFILE": "OFF",
+ "GGML_OPENCL": "ON",
+ "GGML_HEXAGON": "ON",
+ "GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE": "128",
+ "LLAMA_OPENSSL": "OFF"
+ }
+ },
+
+ { "name": "arm64-android-snapdragon-debug" , "inherits": [ "base", "arm64-android-snapdragon", "debug" ] },
+ { "name": "arm64-android-snapdragon-release", "inherits": [ "base", "arm64-android-snapdragon", "release" ] },
+
+ { "name": "arm64-windows-snapdragon-debug" , "inherits": [ "base", "arm64-windows-snapdragon", "debug" ] },
+ { "name": "arm64-windows-snapdragon-release", "inherits": [ "base", "arm64-windows-snapdragon", "release" ] }
+ ]
+}
--- /dev/null
+# Snapdragon-based devices
+
+## Setup
+
+### Android
+
+The easiest way to build llama.cpp for a Snapdragon-based Android device is using the toolchain Docker image (see github.com/snapdragon-toolchain).
+This image includes Android NDK, OpenCL SDK, Hexagon SDK, CMake, etc.
+
+This method works on Linux, macOS, and Windows. macOS and Windows users should install Docker Desktop.
+
+```
+~/src/llama.cpp$ docker run -it -u $(id -u):$(id -g) --volume $(pwd):/workspace --platform linux/amd64 ghcr.io/snapdragon-toolchain/arm64-android:v0.3
+[d]/> cd /workspace
+```
+
+Note: The rest of the **Android** build process assumes that you're running inside the toolchain container.
+
+### Windows On Snapdragon
+
+Native Windows 11 arm64 builds have the following tool dependencies:
+- MS Visual Studio 2026 (Community Edition or Pro)
+ - MSVC arm64 standard and runtime libraries
+ - UCRT and Driver Kit
+- LLVM core libraries and Clang compiler (winget)
+- CMake, Git, Python (winget)
+- Hexagon SDK Community Edition 6.4 or later (see windows.md)
+- OpenCL SDK 2.3 or later (see windows.md)
+
+Note: The rest of the **Windows** build process assumes that you're running natively in PowerShell.
+Adapt the build commands below accordingly.
+
+## How to Build
+
+Let's build llama.cpp with CPU, OpenCL, and Hexagon backends via CMake presets:
+
+```
+[d]/workspace> cp docs/backend/snapdragon/CMakeUserPresets.json .
+
+[d]/workspace> cmake --preset arm64-android-snapdragon-release -B build-snapdragon
+Preset CMake variables:
+ ANDROID_ABI="arm64-v8a"
+ ...
+ CMAKE_TOOLCHAIN_FILE="/opt/android-ndk-r28b/build/cmake/android.toolchain.cmake"
+ GGML_HEXAGON="ON"
+ GGML_OPENCL="ON"
+ GGML_OPENMP="OFF"
+ HEXAGON_SDK_ROOT="/opt/hexagon/6.4.0.2"
+...
+-- Including OpenCL backend
+-- Including Hexagon backend
+...
+-- Build files have been written to: /workspace/build-snapdragon
+
+[d]/workspace> cmake --build build-snapdragon
+...
+[144/356] Performing build step for 'htp-v73'
+[1/16] Generating htp_iface_skel.c, htp_iface_stub.c, htp_iface.h
+[2/16] Building C object CMakeFiles/ggml-htp-v73.dir/hvx-sigmoid.c.obj
+[3/16] Building C object CMakeFiles/ggml-htp-v73.dir/htp-dma.c.obj
+[4/16] Building C object CMakeFiles/ggml-htp-v73.dir/worker-pool.c.obj
+...
+-- Installing: /workspace/build-snapdragon/ggml/src/ggml-hexagon/libggml-htp-v73.so
+-- Installing: /workspace/build-snapdragon/ggml/src/ggml-hexagon/libggml-htp-v75.so
+...
+```
+
+To generate an installable "package", simply use `cmake --install`:
+
+```
+[d]/workspace> cmake --install build-snapdragon --prefix pkg-snapdragon/llama.cpp
+-- Install configuration: "Release"
+-- Installing: /workspace/pkg-snapdragon/llama.cpp/lib/libggml-cpu.so
+-- Installing: /workspace/pkg-snapdragon/llama.cpp/lib/libggml-opencl.so
+-- Installing: /workspace/pkg-snapdragon/llama.cpp/lib/libggml-hexagon.so
+-- Installing: /workspace/pkg-snapdragon/llama.cpp/lib/libggml-htp-v73.so
+-- Installing: /workspace/pkg-snapdragon/llama.cpp/lib/libggml-htp-v75.so
+-- Installing: /workspace/pkg-snapdragon/llama.cpp/lib/libggml-htp-v79.so
+-- Installing: /workspace/pkg-snapdragon/llama.cpp/lib/libggml-htp-v81.so
+-- Installing: /workspace/pkg-snapdragon/llama.cpp/lib/libggml.so
+...
+-- Installing: /workspace/pkg-snapdragon/llama.cpp/bin/llama-bench
+-- Installing: /workspace/pkg-snapdragon/llama.cpp/bin/llama-cli
+...
+```
+
+## How to Install
+
+### Android
+
+For this step, your device needs to be configured for on-device development.
+Please see https://developer.android.com/studio/debug/dev-options for details.
+
+Once ADB is enabled, use `adb push` to install `pkg-snapdragon` on the device.
+**Note that the toolchain Docker image doesn't have ADB and doesn't set up the ADB bridge. Please use native ADB on the host.**
+
+```
+~/src/llama.cpp$ adb push pkg-snapdragon/llama.cpp /data/local/tmp/
+pkg-snapdragon/llama.cpp/bin/: 67 files pushed, 0 skipped. 190.2 MB/s (919095042 bytes in 4.607s)
+pkg-snapdragon/llama.cpp/include/: 19 files pushed, 0 skipped. 20.5 MB/s (255173 bytes in 0.012s)
+pkg-snapdragon/llama.cpp/lib/: 16 files pushed, 0 skipped. 144.4 MB/s (43801382 bytes in 0.289s)
+102 files pushed, 0 skipped. 186.9 MB/s (963151597 bytes in 4.914s)
+```
+
+At this point, you should also install some models:
+
+```
+~/src/llama.cpp$ wget https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf
+...
+2025-10-11 12:04:52 (10.7 MB/s) - ‘Llama-3.2-1B-Instruct-Q4_0.gguf’ saved [773025920/773025920]
+
+~/src/llama.cpp$ adb push Llama-3.2-1B-Instruct-Q4_0.gguf /data/local/tmp/gguf
+Llama-3.2-1B-Instruct-Q4_0.gguf: 1 file pushed, 0 skipped. 38.3 MB/s (773025920 bytes in 19.250s)
+```
+
+### Windows
+
+All artifacts are already installed in the `pkg-snapdragon` folder.
+To run, adapt the instructions below to use the PowerShell scripts in `scripts/snapdragon/windows`.
+
+## How to Run
+
+The easiest way to run llama.cpp cli tools is using provided wrapper scripts that properly set up all required environment variables.
+
+llama.cpp supports three backends on Snapdragon-based devices: CPU, Adreno GPU (GPUOpenCL), and Hexagon NPU (HTP0-4).
+You can select which backend to run the model on using the `D=` variable, which maps to the `--device` option.
+
+Hexagon NPU behaves as a "GPU" device when it comes to `-ngl` and other offload-related options.
+
+Here are some examples of running various llama.cpp tools via ADB.
+
+Simple question for Llama-3.2-1B
+
+```
+~/src/llama.cpp$ M=Llama-3.2-1B-Instruct-Q4_0.gguf D=HTP0 ./scripts/snapdragon/adb/run-completion.sh -p "what is the most popular cookie in the world?"
+...
+ggml-hex: Hexagon backend (experimental) : allocating new registry : ndev 1
+ggml-hex: Hexagon Arch version v79
+ggml-hex: allocating new session: HTP0
+ggml-hex: new session: HTP0 : session-id 0 domain-id 3 uri file:///libggml-htp-v79.so?htp_iface_skel_handle_invoke&_modver=1.0&_dom=cdsp&_session=0 handle 0xb4000072c7955e50
+...
+load_tensors: offloading output layer to GPU
+load_tensors: offloaded 17/17 layers to GPU
+load_tensors: CPU model buffer size = 225.49 MiB
+load_tensors: HTP0 model buffer size = 0.26 MiB
+load_tensors: HTP0-REPACK model buffer size = 504.00 MiB
+...
+I hope this helps you understand the world's most popular cookies! [end of text]
+...
+llama_perf_sampler_print: sampling time = 30.08 ms / 487 runs ( 0.06 ms per token, 16191.77 tokens per second)
+llama_perf_context_print: load time = 617.94 ms
+llama_perf_context_print: prompt eval time = 80.76 ms / 11 tokens ( 7.34 ms per token, 136.21 tokens per second)
+llama_perf_context_print: eval time = 9210.59 ms / 475 runs ( 19.39 ms per token, 51.57 tokens per second)
+llama_perf_context_print: total time = 9454.92 ms / 486 tokens
+llama_perf_context_print: graphs reused = 473
+llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
+llama_memory_breakdown_print: | - HTP0 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 |
+llama_memory_breakdown_print: | - Host | 439 = 225 + 136 + 77 |
+llama_memory_breakdown_print: | - HTP0-REPACK | 504 = 504 + 0 + 0 |
+```
+
+Summary request for OLMoE-1B-7B. This is a large model that requires two HTP sessions/devices.
+
+```
+~/src/llama.cpp$ M=OLMoE-1B-7B-0125-Instruct-Q4_0.gguf NDEV=2 D=HTP0,HTP1 ./scripts/snapdragon/adb/run-completion.sh -f surfing.txt
+...
+ggml-hex: Hexagon backend (experimental) : allocating new registry : ndev 1
+ggml-hex: Hexagon Arch version v81
+ggml-hex: allocating new session: HTP0
+ggml-hex: allocating new session: HTP1
+...
+load_tensors: offloading output layer to GPU
+load_tensors: offloaded 17/17 layers to GPU
+load_tensors: CPU model buffer size = 143.86 MiB
+load_tensors: HTP1 model buffer size = 0.23 MiB
+load_tensors: HTP1-REPACK model buffer size = 1575.00 MiB
+load_tensors: HTP0 model buffer size = 0.28 MiB
+load_tensors: HTP0-REPACK model buffer size = 2025.00 MiB
+...
+llama_context: CPU output buffer size = 0.19 MiB
+llama_kv_cache: HTP1 KV buffer size = 238.00 MiB
+llama_kv_cache: HTP0 KV buffer size = 306.00 MiB
+llama_kv_cache: size = 544.00 MiB ( 8192 cells, 16 layers, 1/1 seqs), K (q8_0): 272.00 MiB, V (q8_0): 272.00 MiB
+llama_context: HTP0 compute buffer size = 15.00 MiB
+llama_context: HTP1 compute buffer size = 15.00 MiB
+llama_context: CPU compute buffer size = 24.56 MiB
+...
+llama_perf_context_print: prompt eval time = 1730.57 ms / 212 tokens ( 8.16 ms per token, 122.50 tokens per second)
+llama_perf_context_print: eval time = 5624.75 ms / 257 runs ( 21.89 ms per token, 45.69 tokens per second)
+llama_perf_context_print: total time = 7377.33 ms / 469 tokens
+llama_perf_context_print: graphs reused = 255
+llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
+llama_memory_breakdown_print: | - HTP0 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 |
+llama_memory_breakdown_print: | - HTP1 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 |
+llama_memory_breakdown_print: | - Host | 742 = 144 + 544 + 54 |
+llama_memory_breakdown_print: | - HTP1-REPACK | 1575 = 1575 + 0 + 0 |
+llama_memory_breakdown_print: | - HTP0-REPACK | 2025 = 2025 + 0 + 0 |
+```
+
+Op test for MUL_MAT
+
+```
+~/src/llama.cpp$ HB=0 ./scripts/snapdragon/adb/run-tool.sh test-backend-ops -b HTP0 -o MUL_MAT
+...
+Backend 2/3: HTP0
+Device description: Hexagon
+Device memory: 2048 MB (2048 MB free)
+MUL_MAT(type_a=q4_0,type_b=f32,m=16,n=1,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): OK
+MUL_MAT(type_a=q4_0,type_b=f32,m=16,n=2,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): OK
+MUL_MAT(type_a=q4_0,type_b=f32,m=16,n=3,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): OK
+
+~/src/llama.cpp-hexagon$ M=Llama-3.2-1B-Instruct-Q4_0.gguf ./scripts/snapdragon/adb/run-bench.sh -p 128 -n 64
+...
+ggml-hex: Hexagon backend (experimental) : allocating new registry : ndev 1
+ggml-hex: Hexagon Arch version v79
+ggml-hex: allocating new session: HTP0
+ggml-hex: new session: HTP0 : session-id 0 domain-id 3 uri file:///libggml-htp-v79.so?htp_iface_skel_handle_invoke&_modver=1.0&_dom=cdsp&_session=0 handle 0xb400007d4b231090
+| model | size | params | backend | ngl | threads | n_batch | mmap | test | t/s |
+| ---------------| ---------: | -----: | ---------- | --: | ------: | ------: | ---: | ----: | ------------: |
+| llama 1B Q4_0 | 729.75 MiB | 1.24 B | HTP | 99 | 4 | 128 | 0 | pp128 | 169.42 ± 1.75 |
+| llama 1B Q4_0 | 729.75 MiB | 1.24 B | HTP | 99 | 4 | 128 | 0 | tg64 | 51.54 ± 1.13 |
+
+build: 6a8cf8914 (6733)
+```
+
+## Environment variables
+
+- `GGML_HEXAGON_NDEV=1`
+ Controls the number of devices/sessions to allocate. The default is 1.
+ Most quantized models under 4B fit into a single session; an 8B model needs two, and a 20B model needs four.
+
+- `GGML_HEXAGON_NHVX=0`
+ Controls the number of HVX hardware threads to use. The default is all (actual number varies depending on the hardware version).
+
+- `GGML_HEXAGON_HOSTBUF=1`
+ Controls whether the Hexagon backend allocates host buffers. By default, all buffers except for REPACK are host buffers.
+ Setting this option to `0` is required for testing Ops that require REPACK buffers (MUL_MAT and MUL_MAT_ID).
+
+- `GGML_HEXAGON_EXPERIMENTAL=1`
+ Controls whether the Hexagon backend enables experimental features.
+ This option is required for enabling/testing experimental Ops (FLASH_ATTN_EXT).
+
+- `GGML_HEXAGON_VERBOSE=1`
+ Enables verbose logging of Ops from the backend. Example output:
+
+ ```
+ ggml-hex: HTP0 graph-compute n_nodes 2
+ ggml-hex: HTP0 matmul : blk.27.ffn_up.weight x ffn_norm-27 -> ffn_up-27 : 3072:8192 x 3072:1 -> 8192:1 : q4_0 x f32 -> f32 : HTP0 x HTP0 -> HTP0 : flags 0x1
+ ggml-hex: HTP0 matmul : blk.27.ffn_gate.weight x ffn_norm-27 -> ffn_gate-27 : 3072:8192 x 3072:1 -> 8192:1 : q4_0 x f32 -> f32 : HTP0 x HTP0 -> HTP0 : flags 0x3
+ ggml-hex: HTP0 graph-compute n_nodes 1
+ ggml-hex: HTP0 matmul : blk.27.ffn_down.weight x ffn_gate_par-27 -> ffn_out-27 : 8192:3072 x 8192:1 -> 3072:1 : q4_0 x f32 -> f32 : HTP0 x HTP0 -> HTP0 : flags 0x0
+ ggml-hex: HTP0 get-tensor result_output : data 0x7592487000 offset 0 size 513024
+ ```
+
+- `GGML_HEXAGON_PROFILE=1`
+ Generates a host-side profile for the ggml-hexagon Ops.
+
+- `GGML_HEXAGON_OPMASK=0x0`
+ Allows enabling specific stages of the processing pipeline:
+
+ - `0x1` Enable Op Queue (i.e., queuing Ops into NPU)
+ - `0x2` Enable Dynamic Quantizer (if needed for the Op)
+ - `0x4` Enable Op Compute (MUL_MAT, etc.)
+
+ Examples:
+
+ `GGML_HEXAGON_OPMASK=0x1 llama-completion ...` - Ops are enqueued but NPU-side processing is stubbed out
+ `GGML_HEXAGON_OPMASK=0x3 llama-completion ...` - NPU performs dynamic quantization and skips the rest
+ `GGML_HEXAGON_OPMASK=0x7 llama-completion ...` - Full queuing and processing of Ops (default)
--- /dev/null
+# Hexagon backend developer details
+
+## Backend libraries
+
+The Hexagon backend consists of two parts:
+
+ - `libggml-hexagon`
+ This is the regular CPU-side GGML backend library, either shared or statically linked
+
+ - `libggml-htp-vNN`
+ This is the NPU-side (HTP stands for Hexagon Tensor Processor) shared library that contains the Op dispatcher and kernels.
+ The correct library is selected automatically at runtime based on the HW version.
+
+Here is an example of the build artifacts
+
+```
+~/src/llama.cpp$ ls -l pkg-adb/llama.cpp/lib/libggml*
+pkg-adb/llama.cpp/lib/libggml-base.so
+pkg-adb/llama.cpp/lib/libggml-cpu.so
+pkg-adb/llama.cpp/lib/libggml-hexagon.so <<< CPU library
+pkg-adb/llama.cpp/lib/libggml-htp-v73.so <<< HTP op/kernels for Hexagon v73
+pkg-adb/llama.cpp/lib/libggml-htp-v75.so
+pkg-adb/llama.cpp/lib/libggml-htp-v79.so
+pkg-adb/llama.cpp/lib/libggml-htp-v81.so
+```
+
+## Memory buffers
+
+Hexagon NPU backend takes advantage of the Snapdragon's unified memory model where all buffers are fully accessible by the CPU and GPU.
+The NPU does have a dedicated tightly-coupled memory called VTCM but that memory is used only for intermediate data (e.g. dynamically
+quantized tensors) or temporary data (chunks of the weight tensors fetched via DMA).
+
+Please note that currently the Hexagon backend does not implement SET/GET_ROWS Ops because there is no advantage in offloading those
+to the NPU at this point.
+
+The backend allocates non-host buffers for the tensors with datatypes that require repacking: Q4_0, Q8_0, MXFP4.
+From the MMU perspective these buffers are still regular buffers (normal access by the CPU); they are marked as non-host simply to force
+the repacking.
+
+## Large model handling
+
+Hexagon NPU session (aka Process Domain (PD) in the Hexagon docs) is limited to a memory mapping of around 3.5GB.
+In llama.cpp/GGML the Hexagon session is mapped to a single GGML backend device (HTP0, HTP1, etc).
+
+In order to map models larger than 3.5GB we need to allocate multiple devices and split the model.
+For this we're taking advantage of the llama.cpp/GGML multi-GPU layer-splitting support.
+Each Hexagon device behaves like a GPU from the offload and model splitting perspective.
+
+Here is an example of running GPT-OSS-20B model on a newer Snapdragon device with 16GB of DDR.
+
+```
+M=gpt-oss-20b-Q4_0.gguf NDEV=4 D=HTP0,HTP1,HTP2,HTP3 P=surfing.txt scripts/snapdragon/adb/run-completion.sh -f surfing.txt -n 32
+...
+LD_LIBRARY_PATH=/data/local/tmp/llama.cpp/lib
+ADSP_LIBRARY_PATH=/data/local/tmp/llama.cpp/lib
+GGML_HEXAGON_NDEV=4 ./bin/llama-cli --no-mmap -m /data/local/tmp/llama.cpp/../gguf/gpt-oss-20b-Q4_0.gguf
+ -t 4 --ctx-size 8192 --batch-size 128 -ctk q8_0 -ctv q8_0 -fa on -ngl 99 --device HTP0,HTP1,HTP2,HTP3 -no-cnv -f surfing.txt
+...
+llama_model_loader: - type f32: 289 tensors
+llama_model_loader: - type q4_0: 96 tensors
+llama_model_loader: - type q8_0: 2 tensors
+llama_model_loader: - type mxfp4: 72 tensors
+...
+load_tensors: offloaded 25/25 layers to GPU
+load_tensors: CPU model buffer size = 1182.09 MiB
+load_tensors: HTP1 model buffer size = 6.64 MiB
+load_tensors: HTP1-REPACK model buffer size = 2505.94 MiB
+load_tensors: HTP3 model buffer size = 5.55 MiB
+load_tensors: HTP3-REPACK model buffer size = 2088.28 MiB
+load_tensors: HTP0 model buffer size = 7.75 MiB
+load_tensors: HTP0-REPACK model buffer size = 2923.59 MiB
+load_tensors: HTP2 model buffer size = 6.64 MiB
+load_tensors: HTP2-REPACK model buffer size = 2505.94 MiB
+...
+llama_context: n_ctx_per_seq (8192) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
+llama_context: CPU output buffer size = 0.77 MiB
+llama_kv_cache_iswa: creating non-SWA KV cache, size = 8192 cells
+llama_kv_cache: HTP1 KV buffer size = 25.50 MiB
+llama_kv_cache: HTP3 KV buffer size = 25.50 MiB
+llama_kv_cache: HTP0 KV buffer size = 25.50 MiB
+llama_kv_cache: HTP2 KV buffer size = 25.50 MiB
+llama_kv_cache: size = 102.00 MiB ( 8192 cells, 12 layers, 1/1 seqs), K (q8_0): 51.00 MiB, V (q8_0): 51.00 MiB
+llama_kv_cache_iswa: creating SWA KV cache, size = 256 cells
+llama_kv_cache: HTP1 KV buffer size = 0.80 MiB
+llama_kv_cache: HTP3 KV buffer size = 0.53 MiB
+llama_kv_cache: HTP0 KV buffer size = 1.06 MiB
+llama_kv_cache: HTP2 KV buffer size = 0.80 MiB
+llama_kv_cache: size = 3.19 MiB ( 256 cells, 12 layers, 1/1 seqs), K (q8_0): 1.59 MiB, V (q8_0): 1.59 MiB
+llama_context: HTP0 compute buffer size = 16.06 MiB
+llama_context: HTP1 compute buffer size = 16.06 MiB
+llama_context: HTP2 compute buffer size = 16.06 MiB
+llama_context: HTP3 compute buffer size = 16.06 MiB
+llama_context: CPU compute buffer size = 98.19 MiB
+...
+llama_perf_context_print: prompt eval time = 3843.67 ms / 197 tokens ( 19.51 ms per token, 51.25 tokens per second)
+llama_perf_context_print: eval time = 1686.13 ms / 31 runs ( 54.39 ms per token, 18.39 tokens per second)
+llama_perf_context_print: total time = 6266.30 ms / 228 tokens
+llama_perf_context_print: graphs reused = 30
+llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
+llama_memory_breakdown_print: | - HTP0 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 |
+llama_memory_breakdown_print: | - HTP1 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 |
+llama_memory_breakdown_print: | - HTP2 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 |
+llama_memory_breakdown_print: | - HTP3 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 |
+llama_memory_breakdown_print: | - Host | 1476 = 1208 + 105 + 162 |
+llama_memory_breakdown_print: | - HTP1-REPACK | 2505 = 2505 + 0 + 0 |
+llama_memory_breakdown_print: | - HTP3-REPACK | 2088 = 2088 + 0 + 0 |
+llama_memory_breakdown_print: | - HTP0-REPACK | 2923 = 2923 + 0 + 0 |
+llama_memory_breakdown_print: | - HTP2-REPACK | 2505 = 2505 + 0 + 0 |
+```
--- /dev/null
+## Overview
+
+The document covers procedures for installing the latest GPU and NPU drivers, and OpenCL and Hexagon SDKs.
+
+
+In order to use Hexagon NPU on Snapdragon Windows devices the underlying HTP Ops libraries (e.g. libggml-htp-v73.so)
+must be included in the .cat file digitally signed with a trusted certificate.
+
+This document covers details on how to generate personal certificate files (.pfx) and how to configure the system
+to allow for test signatures (aka test-signing).
+
+## Install the latest Adreno OpenCL SDK
+
+Either use the trimmed down version (optimized for CI) from
+
+ https://github.com/snapdragon-toolchain/opencl-sdk/releases/download/v2.3.2/adreno-opencl-sdk-v2.3.2-arm64-wos.tar.xz
+
+Or download the complete official version from
+
+ https://softwarecenter.qualcomm.com/catalog/item/Adreno_OpenCL_SDK?version=2.3.2
+
+Unzip/untar the archive into
+```
+c:\Qualcomm\OpenCL_SDK\2.3.2
+```
+
+## Install the latest Hexagon SDK Community Edition
+
+Either use the trimmed down version (optimized for CI) from
+
+ https://github.com/snapdragon-toolchain/hexagon-sdk/releases/download/v6.4.0.2/hexagon-sdk-v6.4.0.2-arm64-wos.tar.xz
+
+Or download the complete official version from
+
+ https://softwarecenter.qualcomm.com/catalog/item/Hexagon_SDK?version=6.4.0.2
+
+Unzip/untar the archive into
+```
+c:\Qualcomm\Hexagon_SDK\6.4.0.2
+```
+
+## Install the latest Adreno GPU driver
+
+Download the driver from
+
+ https://softwarecenter.qualcomm.com/catalog/item/Windows_Graphics_Driver
+
+After the automated installation and reboot please make sure that the GPU device shows up in the `Device Manager` (under `Display Adapters`).
+
+## Install the latest Qualcomm NPU driver
+
+Download the driver from
+
+ https://softwarecenter.qualcomm.com/catalog/item/Qualcomm_HND
+
+After the automated installation and reboot please make sure that the Hexagon NPU device shows up in the `Device Manager` (under `Neural Processors`).
+
+If the device is not available you can try installing all components (`qcnspmcdm8380`, `qcnspmcdm8380_ext`) manually.
+The components are extracted into
+```
+c:\QCDrivers\qcnspmcdm...
+```
+
+## Enable NPU driver test signatures
+
+Please note that the following steps are required only for the Hexagon NPU.
+Adreno GPU backend does not require test signatures.
+
+### Enable testsigning
+
+Use `bcdedit` to enable test-signing
+```
+> bcdedit /set TESTSIGNING ON
+```
+(Secure Boot may need to be disabled for this to work)
+
+Make sure test-signing is enabled after reboot
+```
+> bcdedit /enum
+...
+testsigning Yes
+...
+```
+For additional details see Microsoft guide at
+
+ https://learn.microsoft.com/en-us/windows-hardware/drivers/install/the-testsigning-boot-configuration-option
+
+### Create personal certificate
+
+The tools required for this procedure are available as part of Windows SDK and Windows Driver Kit which should be
+installed as part of the MS Visual Studio.
+They are typically located at
+```
+c:\Program Files (x86)\Windows Kits\10\bin\10.0.26100.0
+```
+(replace 10.0.26100.0 with correct version).
+
+To create a personal self-signed certificate run the following commands (either from cmd or PowerShell):
+```
+> cd c:\Users\MyUser
+> mkdir Certs
+> cd Certs
+> makecert -r -pe -ss PrivateCertStore -n CN=GGML.HTP.v1 -eku 1.3.6.1.5.5.7.3.3 -sv ggml-htp-v1.pvk ggml-htp-v1.cer
+> pvk2pfx.exe -pvk ggml-htp-v1.pvk -spc ggml-htp-v1.cer -pfx ggml-htp-v1.pfx
+```
+(replace `MyUser` with your username).
+
+Add this certificate to `Trusted Root Certification Authorities` and `Trusted Publishers` stores.
+This can be done using `certlm` Certificate Manager tool.
+Right click on the certificate store, select `All Tasks -> Import` and follow the prompts to import the certificate from the
+PFX file you created above.
+
+For additional details see Microsoft guide at
+
+ https://learn.microsoft.com/en-us/windows-hardware/drivers/install/introduction-to-test-signing
+
+Make sure to save the PFX file, you will need it for the build procedures.
+Please note that the same certificate can be used for signing any number of builds.
+
+## Build Hexagon backend with signed HTP ops libraries
+
+The overall Hexagon backend build procedure for Windows on Snapdragon is the same as for other platforms.
+However, additional settings are required for generating and signing HTP Ops libraries.
+```
+> $env:OPENCL_SDK_ROOT="C:\Qualcomm\OpenCL_SDK\2.3.2"
+> $env:HEXAGON_SDK_ROOT="C:\Qualcomm\Hexagon_SDK\6.4.0.2"
+> $env:HEXAGON_TOOLS_ROOT="C:\Qualcomm\Hexagon_SDK\6.4.0.2\tools\HEXAGON_Tools\19.0.04"
+> $env:HEXAGON_HTP_CERT="c:\Users\MyUser\Certs\ggml-htp-v1.pfx"
+> $env:WINDOWS_SDK_BIN="C:\Program Files (x86)\Windows Kits\10\bin\10.0.26100.0\arm64"
+
+> cmake --preset arm64-windows-snapdragon -B build-wos
+...
+> cmake --install build-wos --prefix pkg-snapdragon
+```
+
+Once the build is complete HTP ops libraries will be installed like this
+```
+> dir pkg-snapdragon/lib
+...
+-a---- 1/22/2026 6:01 PM 187656 libggml-htp-v73.so
+-a---- 1/22/2026 6:01 PM 191752 libggml-htp-v75.so
+-a---- 1/22/2026 6:01 PM 187656 libggml-htp-v79.so
+-a---- 1/22/2026 6:01 PM 187656 libggml-htp-v81.so
+-a---- 1/22/2026 6:01 PM 4139 libggml-htp.cat
+```
+
+The .cat file, the signature and proper certificate installation can be verified with
+
+```
+> signtool.exe verify /v /pa .\pkg-snapdragon\lib\libggml-htp.cat
+Verifying: .\pkg-snapdragon\lib\libggml-htp.cat
+
+Signature Index: 0 (Primary Signature)
+Hash of file (sha256): 9820C664DA59D5EAE31DBB664127FCDAEF59CDC31502496BC567544EC2F401CF
+
+Signing Certificate Chain:
+ Issued to: GGML.HTP.v1
+...
+Successfully verified: .\pkg-snapdragon\lib\libggml-htp.cat
+...
+```
endif()
add_library(ggml
+ ggml-backend-dl.cpp
ggml-backend-reg.cpp)
add_library(ggml::ggml ALIAS ggml)
--- /dev/null
+#include "ggml-backend-dl.h"
+
+#ifdef _WIN32
+
+// Windows: load the library at `path` and hand back the HMODULE produced by LoadLibraryW.
+dl_handle * dl_load_library(const fs::path & path) {
+    // keep the system from showing error dialogs for missing DLLs while loading
+    const DWORD saved_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
+    SetErrorMode(saved_mode | SEM_FAILCRITICALERRORS);
+
+    const auto wide_path = path.wstring();
+    HMODULE    module    = LoadLibraryW(wide_path.c_str());
+
+    // restore the error mode that was active on entry
+    SetErrorMode(saved_mode);
+
+    return module;
+}
+
+// Windows: look up the exported symbol `name` in `handle` via GetProcAddress.
+void * dl_get_sym(dl_handle * handle, const char * name) {
+    // same error-dialog suppression as used while loading the library
+    const DWORD saved_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
+    SetErrorMode(saved_mode | SEM_FAILCRITICALERRORS);
+
+    void * symbol = (void *) GetProcAddress(handle, name);
+
+    // restore the error mode that was active on entry
+    SetErrorMode(saved_mode);
+
+    return symbol;
+}
+
+// Windows: no textual loader error is reported; this always returns an empty string.
+const char * dl_error() {
+ return "";
+}
+
+#else
+
+// POSIX: dlopen the library at `path`, resolving all symbols immediately (RTLD_NOW)
+// and keeping them private to this handle (RTLD_LOCAL).
+dl_handle * dl_load_library(const fs::path & path) {
+    const auto native_path = path.string();
+    return dlopen(native_path.c_str(), RTLD_NOW | RTLD_LOCAL);
+}
+
+// POSIX: look up the symbol `name` in the library referred to by `handle` via dlsym.
+void * dl_get_sym(dl_handle * handle, const char * name) {
+    void * symbol = dlsym(handle, name);
+    return symbol;
+}
+
+// POSIX: return the pending dlerror() message, or an empty string when there is none,
+// so the result is never a null pointer.
+const char * dl_error() {
+    if (const char * message = dlerror()) {
+        return message;
+    }
+    return "";
+}
+
+#endif
--- /dev/null
+#pragma once
+
+#ifdef _WIN32
+# define WIN32_LEAN_AND_MEAN
+# ifndef NOMINMAX
+# define NOMINMAX
+# endif
+# include <windows.h>
+# include <winevt.h>
+#else
+# include <dlfcn.h>
+# include <unistd.h>
+#endif
+#include <filesystem>
+#include <memory>
+#include <type_traits>
+
+namespace fs = std::filesystem;
+
+#ifdef _WIN32
+
+// Windows: dl_handle is the pointee type of HMODULE, so `dl_handle *` is an HMODULE.
+using dl_handle = std::remove_pointer_t<HMODULE>;
+
+// Deleter used by dl_handle_ptr: releases the module with FreeLibrary.
+struct dl_handle_deleter {
+ void operator()(HMODULE handle) {
+ FreeLibrary(handle);
+ }
+};
+
+#else
+
+// POSIX: library handles are the opaque `void *` values used with dlclose/dlsym.
+using dl_handle = void;
+
+// Deleter used by dl_handle_ptr: releases the library with dlclose.
+struct dl_handle_deleter {
+ void operator()(void * handle) {
+ dlclose(handle);
+ }
+};
+
+#endif
+
+// Owning handle: unloads the library through dl_handle_deleter when destroyed.
+using dl_handle_ptr = std::unique_ptr<dl_handle, dl_handle_deleter>;
+
+// Platform-neutral dynamic loading helpers (implemented in ggml-backend-dl.cpp).
+// dl_load_library: load the shared library at `path`; returns a raw, non-owning handle.
+// dl_get_sym:      look up symbol `name` in the library referred to by `handle`.
+// dl_error:        text describing the last loader error; never null, may be empty.
+dl_handle * dl_load_library(const fs::path & path);
+void * dl_get_sym(dl_handle * handle, const char * name);
+const char * dl_error();
+
#include "ggml-backend-impl.h"
#include "ggml-backend.h"
+#include "ggml-backend-dl.h"
#include "ggml-impl.h"
#include <algorithm>
#include <cstring>
}
}
-#ifdef _WIN32
-
-using dl_handle = std::remove_pointer_t<HMODULE>;
-
-struct dl_handle_deleter {
- void operator()(HMODULE handle) {
- FreeLibrary(handle);
- }
-};
-
-static dl_handle * dl_load_library(const fs::path & path) {
- // suppress error dialogs for missing DLLs
- DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
- SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
-
- HMODULE handle = LoadLibraryW(path.wstring().c_str());
-
- SetErrorMode(old_mode);
-
- return handle;
-}
-
-static void * dl_get_sym(dl_handle * handle, const char * name) {
- DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
- SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
-
- void * p = (void *) GetProcAddress(handle, name);
-
- SetErrorMode(old_mode);
-
- return p;
-}
-
-static const char * dl_error() {
- return "";
-}
-
-#else
-
-using dl_handle = void;
-
-struct dl_handle_deleter {
- void operator()(void * handle) {
- dlclose(handle);
- }
-};
-
-static void * dl_load_library(const fs::path & path) {
- dl_handle * handle = dlopen(path.string().c_str(), RTLD_NOW | RTLD_LOCAL);
-
- return handle;
-}
-
-static void * dl_get_sym(dl_handle * handle, const char * name) {
- return dlsym(handle, name);
-}
-
-static const char * dl_error() {
- const char *rslt = dlerror();
- return rslt != nullptr ? rslt : "";
-}
-
-#endif
-
-using dl_handle_ptr = std::unique_ptr<dl_handle, dl_handle_deleter>;
-
struct ggml_backend_reg_entry {
ggml_backend_reg_t reg;
dl_handle_ptr handle;
+# Convert the SDK locations to CMake-style paths before they are used below
+file(TO_CMAKE_PATH "${HEXAGON_SDK_ROOT}" HEXAGON_SDK_ROOT)
+file(TO_CMAKE_PATH "${HEXAGON_TOOLS_ROOT}" HEXAGON_TOOLS_ROOT)
+
+# Abort the configure step unless both locations are existing directories
+if (NOT IS_DIRECTORY "${HEXAGON_SDK_ROOT}" OR NOT IS_DIRECTORY "${HEXAGON_TOOLS_ROOT}")
+ message(FATAL_ERROR "Make sure HEXAGON_SDK_ROOT and HEXAGON_TOOLS_ROOT point to the correct Hexagon SDK installation.")
+endif()
+
+message(STATUS "hexagon: using ${HEXAGON_SDK_ROOT} and ${HEXAGON_TOOLS_ROOT} for building libggml-htp skels")
+
include(${HEXAGON_SDK_ROOT}/build/cmake/hexagon_fun.cmake)
include(ExternalProject)
option(GGML_HEXAGON_HTP_DEBUG "ggml-hexagon: enable HTP debug output" OFF)
+set(GGML_HEXAGON_HTP_CERT "$ENV{HEXAGON_HTP_CERT}" CACHE PATH "ggml-hexagon: enable HTP library signing using certificate")
set(GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE 128 CACHE STRING "ggml-hexagon: quantize group size (32, 64, or 128)")
add_library(htp_iface OBJECT
target_link_options(htp_iface PUBLIC -ldl)
endif()
-link_custom_library(htp_iface cdsprpc)
-link_custom_library(htp_iface rpcmem)
-
set(TARGET_NAME ggml-hexagon)
ggml_add_backend_library(${TARGET_NAME}
- ggml-hexagon.cpp htp-utils.c htp-utils.h ../../include/ggml-hexagon.h)
+ ggml-hexagon.cpp
+ htp-drv.cpp
+ htp-drv.h
+ libdl.h
+ ../../include/ggml-hexagon.h)
target_link_libraries(${TARGET_NAME} PRIVATE htp_iface)
target_include_directories(${TARGET_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/htp ${CMAKE_CURRENT_BINARY_DIR})
-# Build HTP bits
-set(HTP_CMAKE_ARGS
- -DCMAKE_TOOLCHAIN_FILE=${CMAKE_CURRENT_SOURCE_DIR}/htp/cmake-toolchain.cmake
- -DCMAKE_BUILD_TYPE=Release
- -DCMAKE_INSTALL_LIBDIR=${CMAKE_CURRENT_BINARY_DIR}
- -DHEXAGON_SDK_ROOT=$ENV{HEXAGON_SDK_ROOT}
- -DHEXAGON_TOOLS_ROOT=$ENV{HEXAGON_TOOLS_ROOT}
- -DHEXAGON_HTP_DEBUG=${GGML_HEXAGON_HTP_DEBUG}
- -DGGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE=${GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE})
-
-ExternalProject_Add(htp-v68
- SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/htp BUILD_ALWAYS ON
- CMAKE_ARGS ${HTP_CMAKE_ARGS} -DDSP_VERSION=v68 -DPREBUILT_LIB_DIR="toolv19_v68")
-
-ExternalProject_Add(htp-v69
- SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/htp BUILD_ALWAYS ON
- CMAKE_ARGS ${HTP_CMAKE_ARGS} -DDSP_VERSION=v69 -DPREBUILT_LIB_DIR="toolv19_v69")
-
-ExternalProject_Add(htp-v73
- SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/htp BUILD_ALWAYS ON
- CMAKE_ARGS ${HTP_CMAKE_ARGS} -DDSP_VERSION=v73 -DPREBUILT_LIB_DIR="toolv19_v73")
-
-ExternalProject_Add(htp-v75
- SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/htp BUILD_ALWAYS ON
- CMAKE_ARGS ${HTP_CMAKE_ARGS} -DDSP_VERSION=v75 -DPREBUILT_LIB_DIR="toolv19_v75")
-
-ExternalProject_Add(htp-v79
- SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/htp BUILD_ALWAYS ON
- CMAKE_ARGS ${HTP_CMAKE_ARGS} -DDSP_VERSION=v79 -DPREBUILT_LIB_DIR="toolv19_v79")
-
-ExternalProject_Add(htp-v81
- SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/htp BUILD_ALWAYS ON
- CMAKE_ARGS ${HTP_CMAKE_ARGS} -DDSP_VERSION=v81 -DPREBUILT_LIB_DIR="toolv19_v81")
+# Build HTP skels
+#
+# build_htp_skel(<version>) adds an ExternalProject named htp-<version> that builds
+# libggml-htp-<version>.so from the htp/ sources, and appends the path of that
+# library to HTP_SKELS in the caller's scope.
+set(HTP_SKELS)
+function(build_htp_skel V)
+    set(SKEL_LIB ${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-${V}.so)
+
+    ExternalProject_Add(htp-${V}
+        SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/htp BUILD_ALWAYS ON
+        BUILD_BYPRODUCTS ${SKEL_LIB}
+        CMAKE_ARGS
+            -DCMAKE_BUILD_TYPE=Release
+            -DCMAKE_TOOLCHAIN_FILE=${CMAKE_CURRENT_SOURCE_DIR}/htp/cmake-toolchain.cmake
+            -DCMAKE_INSTALL_LIBDIR=${CMAKE_CURRENT_BINARY_DIR}
+            -DHEXAGON_SDK_ROOT=${HEXAGON_SDK_ROOT}
+            -DHEXAGON_TOOLS_ROOT=${HEXAGON_TOOLS_ROOT}
+            -DHEXAGON_HTP_DEBUG=${GGML_HEXAGON_HTP_DEBUG}
+            -DGGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE=${GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE}
+            -DDSP_VERSION=${V}
+            -DPREBUILT_LIB_DIR="toolv19_${V}")
+
+    # publish the extended list to the caller
+    set(HTP_SKELS ${HTP_SKELS} ${SKEL_LIB} PARENT_SCOPE)
+endfunction()
+
+foreach(HTP_ARCH v68 v69 v73 v75 v79 v81)
+    build_htp_skel(${HTP_ARCH})
+endforeach()
# Install Hexagon skels required at runtime
-install(FILES
- ${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-v68.so
- ${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-v69.so
- ${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-v73.so
- ${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-v75.so
- ${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-v79.so
- ${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-v81.so
- TYPE LIB)
+install(FILES ${HTP_SKELS} TYPE LIB)
+
+# Windows only: generate a catalog (.cat) file for the HTP skels and sign it.
+# Enabled when GGML_HEXAGON_HTP_CERT names the certificate to sign with.
+if (CMAKE_SYSTEM_NAME MATCHES Windows AND GGML_HEXAGON_HTP_CERT)
+    # Candidate Windows SDK tool directories.
+    # WINDOWS_SDK_BIN may point either at the versioned bin directory
+    # (e.g. .../bin/10.0.26100.0) or directly at one of its per-architecture
+    # subdirectories (e.g. .../bin/10.0.26100.0/arm64), so the directory itself
+    # is searched in addition to its arm64 and x86 children.
+    file(TO_CMAKE_PATH "$ENV{WINDOWS_SDK_BIN}"            WINSDK_BIN0)
+    file(TO_CMAKE_PATH "$ENV{WINDOWS_SDK_BIN}/arm64"      WINSDK_BIN0_ARM64)
+    file(TO_CMAKE_PATH "$ENV{WINDOWS_SDK_BIN}/x86"        WINSDK_BIN0_X86)
+    file(TO_CMAKE_PATH "$ENV{WindowsSdkVerBinPath}/arm64" WINSDK_BIN1_ARM64)
+    file(TO_CMAKE_PATH "$ENV{WindowsSdkVerBinPath}/x86"   WINSDK_BIN1_X86)
+
+    set(WINSDK_PATHS
+        ${WINSDK_BIN0}
+        ${WINSDK_BIN0_ARM64} ${WINSDK_BIN0_X86}
+        ${WINSDK_BIN1_ARM64} ${WINSDK_BIN1_X86})
+
+    # Both tools are mandatory once signing has been requested
+    find_program(INF2CAT  NAMES inf2cat.exe  PATHS ${WINSDK_PATHS} REQUIRED)
+    find_program(SIGNTOOL NAMES signtool.exe PATHS ${WINSDK_PATHS} REQUIRED)
+
+    message(STATUS "hexagon: using ${GGML_HEXAGON_HTP_CERT} to sign libggml-htp skels")
+
+    set(LIBGGML_HTP_CAT ${CMAKE_CURRENT_BINARY_DIR}/libggml-htp.cat)
+    # Copy the .inf next to the built skels, run inf2cat on that directory,
+    # then sign the resulting catalog with the configured certificate.
+    add_custom_target(libggml-htp-cat
+        BYPRODUCTS ${LIBGGML_HTP_CAT}
+        DEPENDS libggml-htp.inf ${HTP_SKELS}
+        COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/libggml-htp.inf ${CMAKE_CURRENT_BINARY_DIR}
+        COMMAND ${INF2CAT} /driver:${CMAKE_CURRENT_BINARY_DIR} /os:10_25H2_ARM64
+        COMMAND ${SIGNTOOL} sign /fd sha256 /f ${GGML_HEXAGON_HTP_CERT} ${LIBGGML_HTP_CAT}
+        COMMENT "generating and signing libggml-htp.cat file"
+        VERBATIM
+    )
+
+    # Make the backend library build trigger catalog generation, and ship the catalog with the skels
+    add_dependencies(${TARGET_NAME} libggml-htp-cat)
+    install(FILES ${LIBGGML_HTP_CAT} TYPE LIB)
+endif()
#ifdef _WIN32
# include <sal.h>
-# ifndef _WINDOWS
-# define _WINDOWS
-# endif
#else
# include <semaphore.h>
# include <unistd.h>
#pragma clang diagnostic ignored "-Wnested-anon-types"
#pragma clang diagnostic ignored "-Wgnu-anonymous-struct"
-#include "htp-utils.h"
-
#include <AEEStdErr.h>
#include <dspqueue.h>
#include <rpcmem.h>
#include "op-desc.h"
#include "htp-msg.h"
#include "htp_iface.h"
+#include "htp-drv.h"
static size_t opt_ndev = 1;
static size_t opt_nhvx = 0; // use all
0, // flags - the framework will autoset this
n_bufs, // number of buffers
bufs, // buffer references
- sizeof(req),
+ sizeof(req), // Message length
(const uint8_t *) &req, // Message
- 1000000 // Timeout
+ DSPQUEUE_TIMEOUT // Timeout
);
if (err != 0) {
// Read response packet from queue
int err = dspqueue_read(q, &flags,
- HTP_MAX_PACKET_BUFFERS, // Maximum number of buffer references
- &n_bufs, // Number of buffer references
- bufs, // Buffer references
- sizeof(rsp), // Max message length
- &rsp_size, // Message length
- (uint8_t *) &rsp,
- 1000000); // Timeout
+ HTP_MAX_PACKET_BUFFERS, // Maximum number of buffer references
+ &n_bufs, // Number of buffer references
+ bufs, // Buffer references
+ sizeof(rsp), // Max message length
+ &rsp_size, // Message length
+ (uint8_t *) &rsp, // Message
+ DSPQUEUE_TIMEOUT); // Timeout
if (err == AEE_EEXPIRED) {
// TODO: might need to bail out if the HTP is stuck on something
ggml_backend_hexagon_buffer_context(ggml_hexagon_session * sess, size_t size, bool repack) {
size += 4 * 1024; // extra page for padding
- if (rpcmem_alloc2) {
- this->base = (uint8_t *) rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size);
- } else {
- GGML_LOG_INFO("ggml-hex: %s rpcmem_alloc2 not found, falling back to rpcmem_alloc\n", sess->name.c_str());
- this->base = (uint8_t *) rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size);
- }
-
+ this->base = (uint8_t *) rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size);
if (!this->base) {
GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer : size %zu\n", sess->name.c_str(), size);
throw std::runtime_error("ggml-hex: rpcmem_alloc failed (see log for details)");
}
static inline bool op_reuse_src1(const ggml_tensor * op1, const ggml_tensor * op0) {
- return (op0 && op0->src[1] == op1->src[1] && ggml_is_quantized(op0->src[0]->type) && ggml_is_quantized(op1->src[1]->type));
+ return (op0 && op0->src[1] == op1->src[1] && ggml_is_quantized(op0->src[0]->type));
}
static inline bool is_compute_op(ggml_tensor *node)
{
- return !(ggml_op_is_empty(node->op) || ggml_is_empty(node));
+ return !ggml_op_is_empty(node->op) && !ggml_is_empty(node) && (node->flags & GGML_TENSOR_FLAG_COMPUTE);
}
// scan the graph and figure out last compute op index
const int last = last_compute_op(graph);
- const struct ggml_tensor * prev_quant_op = nullptr; // prev executed op with quantizer
+ const struct ggml_tensor * prev_op = nullptr; // prev executed op
for (int i = 0; i < graph->n_nodes; ++i) {
ggml_tensor * node = graph->nodes[i];
continue;
}
- if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
- continue;
- }
-
uint32_t flags = 0;
// skip quantizer if src1 is reused
- if (op_reuse_src1(node, prev_quant_op)) {
+ if (op_reuse_src1(node, prev_op)) {
flags |= HTP_OPFLAGS_SKIP_QUANTIZE;
}
+ prev_op = node;
+
// ask for early notification for the last Op
if (i == last) {
flags |= HTP_OPFLAGS_EARLY_WAKEUP;
} else {
ggml_hexagon_dispatch_op<init_binary_req<false>>(sess, node, flags);
}
- prev_quant_op = node;
break;
case GGML_OP_MUL_MAT_ID:
if (ggml_is_quantized(node->src[0]->type)) {
} else {
ggml_hexagon_dispatch_op<init_binary_id_req<false>>(sess, node, flags);
}
- prev_quant_op = node;
break;
case GGML_OP_MUL:
case GGML_OP_ADD:
}
// that many nodes forward to search for stackable nodes that can reuse VTCM
- constexpr int N_FORWARD = 8;
+ constexpr int N_FORWARD = 16;
for (int i1 = i0 + 1; i1 < i0 + N_FORWARD && i1 < n; i1++) {
if (used[i1]) {
}
}
+#if defined(__ANDROID__)
if (opt_arch < 75) {
opt_ndev = 1;
GGML_LOG_WARN("ggml-hex: forcing ndev to 1 for SoCs archs lower than v75.\n");
}
+#endif
GGML_LOG_INFO("ggml-hex: Hexagon Arch version v%d\n", opt_arch);
opt_arch = strtoul(str_arch, NULL, 0);
}
+ opt_hostbuf = str_hostbuf ? atoi(str_hostbuf) : 1;
+
reg->context = new ggml_hexagon_registry(reg);
HEX_VERBOSE("ggml-hex: size-of-general-req %zu size-of-general-rsp %zu\n", sizeof(struct htp_general_req),
static std::mutex mutex;
std::lock_guard<std::mutex> lock(mutex);
if (!initialized) {
+ auto nErr = htpdrv_init();
+ if (nErr != AEE_SUCCESS) {
+ return NULL;
+ }
+
ggml_hexagon_init(®);
}
--- /dev/null
+// sample drv interface
+
+#pragma clang diagnostic ignored "-Wgnu-anonymous-struct"
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wsign-compare"
+
+#include <climits>
+#include <filesystem>
+#include <memory>
+#include <set>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <string_view>
+#include <vector>
+#ifdef _WIN32
+# define WIN32_LEAN_AND_MEAN
+# ifndef NOMINMAX
+# define NOMINMAX
+# endif
+# include <windows.h>
+# include <winevt.h>
+#else
+# include <dlfcn.h>
+# include <unistd.h>
+#endif
+#include "ggml-impl.h"
+#include "htp-drv.h"
+#include "libdl.h"
+
+#include <domain.h>
+
+//
+// Driver API types
+//
+
+// Function-pointer types for the driver entry points that htpdrv_init() resolves at runtime.
+// Each type mirrors the signature of the same-named wrapper defined further below.
+
+// rpcmem: RPC buffer allocation (rpcmem_alloc takes an int size, rpcmem_alloc2 a size_t size)
+typedef void * (*rpcmem_alloc_pfn_t)(int heapid, uint32_t flags, int size);
+typedef void * (*rpcmem_alloc2_pfn_t)(int heapid, uint32_t flags, size_t size);
+typedef void (*rpcmem_free_pfn_t)(void * po);
+typedef int (*rpcmem_to_fd_pfn_t)(void * po);
+
+// dspqueue: queue create/close/export and packet write/read
+typedef AEEResult (*dspqueue_create_pfn_t)(int domain,
+ uint32_t flags,
+ uint32_t req_queue_size,
+ uint32_t resp_queue_size,
+ dspqueue_callback_t packet_callback,
+ dspqueue_callback_t error_callback,
+ void * callback_context,
+ dspqueue_t * queue);
+typedef AEEResult (*dspqueue_close_pfn_t)(dspqueue_t queue);
+typedef AEEResult (*dspqueue_export_pfn_t)(dspqueue_t queue, uint64_t *queue_id);
+typedef AEEResult (*dspqueue_write_pfn_t)(dspqueue_t queue, uint32_t flags,
+ uint32_t num_buffers,
+ struct dspqueue_buffer *buffers,
+ uint32_t message_length,
+ const uint8_t *message,
+ uint32_t timeout_us);
+typedef AEEResult (*dspqueue_read_pfn_t)(dspqueue_t queue, uint32_t *flags,
+ uint32_t max_buffers, uint32_t *num_buffers,
+ struct dspqueue_buffer *buffers,
+ uint32_t max_message_length,
+ uint32_t *message_length, uint8_t *message,
+ uint32_t timeout_us);
+
+// fastrpc: map/unmap a buffer (identified by fd) for a domain
+typedef int (*fastrpc_mmap_pfn_t)(int domain, int fd, void *addr, int offset, size_t length, enum fastrpc_map_flags flags);
+typedef int (*fastrpc_munmap_pfn_t)(int domain, int fd, void *addr, size_t length);
+
+// remote: handle open/invoke/close and handle/session control.
+// remote_handle64_close takes the 64-bit handle type (remote_handle64), the same type that
+// remote_handle64_open produces and that the remote_handle64_close() wrapper passes through.
+typedef int (*remote_handle64_open_pfn_t)(const char* name, remote_handle64 *ph);
+typedef int (*remote_handle64_invoke_pfn_t)(remote_handle64 h, uint32_t dwScalars, remote_arg *pra);
+typedef int (*remote_handle64_close_pfn_t)(remote_handle64 h);
+typedef int (*remote_handle_control_pfn_t)(uint32_t req, void* data, uint32_t datalen);
+typedef int (*remote_handle64_control_pfn_t)(remote_handle64 h, uint32_t req, void* data, uint32_t datalen);
+typedef int (*remote_session_control_pfn_t)(uint32_t req, void *data, uint32_t datalen);
+
+//
+// Driver API pfns
+//
+
+// Resolved driver entry points. Every pointer starts out null and is assigned by htpdrv_init().
+// rpcmem_alloc2_pfn is looked up as an optional symbol there and may legitimately stay null;
+// all other symbols are required for htpdrv_init() to succeed.
+rpcmem_alloc_pfn_t rpcmem_alloc_pfn = nullptr;
+rpcmem_alloc2_pfn_t rpcmem_alloc2_pfn = nullptr;
+rpcmem_free_pfn_t rpcmem_free_pfn = nullptr;
+rpcmem_to_fd_pfn_t rpcmem_to_fd_pfn = nullptr;
+
+fastrpc_mmap_pfn_t fastrpc_mmap_pfn = nullptr;
+fastrpc_munmap_pfn_t fastrpc_munmap_pfn = nullptr;
+
+dspqueue_create_pfn_t dspqueue_create_pfn = nullptr;
+dspqueue_close_pfn_t dspqueue_close_pfn = nullptr;
+dspqueue_export_pfn_t dspqueue_export_pfn = nullptr;
+dspqueue_write_pfn_t dspqueue_write_pfn = nullptr;
+dspqueue_read_pfn_t dspqueue_read_pfn = nullptr;
+
+remote_handle64_open_pfn_t remote_handle64_open_pfn = nullptr;
+remote_handle64_invoke_pfn_t remote_handle64_invoke_pfn = nullptr;
+remote_handle64_close_pfn_t remote_handle64_close_pfn = nullptr;
+remote_handle_control_pfn_t remote_handle_control_pfn = nullptr;
+remote_handle64_control_pfn_t remote_handle64_control_pfn = nullptr;
+remote_session_control_pfn_t remote_session_control_pfn = nullptr;
+
+//
+// Driver API
+//
+
+// Forwards to the driver's rpcmem_alloc through rpcmem_alloc_pfn.
+// The pointer is not checked here, so htpdrv_init() must have succeeded before this is called.
+void * rpcmem_alloc(int heapid, uint32_t flags, int size) {
+ return rpcmem_alloc_pfn(heapid, flags, size);
+}
+
+// Allocate an RPC buffer of `size` bytes.
+//
+// Uses the driver's rpcmem_alloc2 when that (optional) symbol was resolved by htpdrv_init().
+// Otherwise falls back to rpcmem_alloc, which takes the size as an int: requests larger than
+// INT_MAX cannot be expressed through that interface and are rejected with a null result
+// instead of being silently converted.
+void * rpcmem_alloc2(int heapid, uint32_t flags, size_t size) {
+    if (rpcmem_alloc2_pfn) {
+        return rpcmem_alloc2_pfn(heapid, flags, size);
+    }
+
+    GGML_LOG_INFO("ggml-hex: rpcmem_alloc2 not found, falling back to rpcmem_alloc\n");
+
+    if (size > (size_t) INT_MAX) {
+        GGML_LOG_ERROR("ggml-hex: rpcmem_alloc cannot allocate %zu bytes (size exceeds INT_MAX)\n", size);
+        return nullptr;
+    }
+
+    return rpcmem_alloc_pfn(heapid, flags, (int) size);
+}
+
+// Forwards to the driver's rpcmem_free through rpcmem_free_pfn (pointer is not null-checked).
+void rpcmem_free(void * po) {
+ return rpcmem_free_pfn(po);
+}
+
+// Forwards to the driver's rpcmem_to_fd through rpcmem_to_fd_pfn (pointer is not null-checked).
+int rpcmem_to_fd(void * po) {
+ return rpcmem_to_fd_pfn(po);
+}
+
+// Forwards to the driver's fastrpc_mmap through fastrpc_mmap_pfn (pointer is not null-checked).
+HTPDRV_API int fastrpc_mmap(int domain, int fd, void * addr, int offset, size_t length, enum fastrpc_map_flags flags) {
+ return fastrpc_mmap_pfn(domain, fd, addr, offset, length, flags);
+}
+
+// Forwards to the driver's fastrpc_munmap through fastrpc_munmap_pfn (pointer is not null-checked).
+HTPDRV_API int fastrpc_munmap(int domain, int fd, void * addr, size_t length) {
+ return fastrpc_munmap_pfn(domain, fd, addr, length);
+}
+
+// dspqueue_* wrappers: each one forwards its arguments unchanged to the matching driver entry
+// point resolved by htpdrv_init() and returns the driver's result. None of them checks its
+// function pointer, so htpdrv_init() must have succeeded first.
+
+// Forwards to the driver's dspqueue_create.
+AEEResult dspqueue_create(int domain,
+ uint32_t flags,
+ uint32_t req_queue_size,
+ uint32_t resp_queue_size,
+ dspqueue_callback_t packet_callback,
+ dspqueue_callback_t error_callback,
+ void * callback_context,
+ dspqueue_t * queue) {
+ return dspqueue_create_pfn(domain, flags, req_queue_size, resp_queue_size, packet_callback, error_callback,
+ callback_context, queue);
+}
+
+// Forwards to the driver's dspqueue_close.
+AEEResult dspqueue_close(dspqueue_t queue) {
+ return dspqueue_close_pfn(queue);
+}
+
+// Forwards to the driver's dspqueue_export.
+AEEResult dspqueue_export(dspqueue_t queue, uint64_t * queue_id) {
+ return dspqueue_export_pfn(queue, queue_id);
+}
+
+// Forwards to the driver's dspqueue_write.
+AEEResult dspqueue_write(dspqueue_t queue,
+ uint32_t flags,
+ uint32_t num_buffers,
+ struct dspqueue_buffer * buffers,
+ uint32_t message_length,
+ const uint8_t * message,
+ uint32_t timeout_us) {
+ return dspqueue_write_pfn(queue, flags, num_buffers, buffers, message_length, message, timeout_us);
+}
+
+// Forwards to the driver's dspqueue_read.
+AEEResult dspqueue_read(dspqueue_t queue,
+ uint32_t * flags,
+ uint32_t max_buffers,
+ uint32_t * num_buffers,
+ struct dspqueue_buffer * buffers,
+ uint32_t max_message_length,
+ uint32_t * message_length,
+ uint8_t * message,
+ uint32_t timeout_us) {
+ return dspqueue_read_pfn(queue, flags, max_buffers, num_buffers, buffers, max_message_length, message_length,
+ message, timeout_us);
+}
+
+// remote_* wrappers: each one forwards its arguments unchanged to the matching driver entry
+// point resolved by htpdrv_init() and returns the driver's result. None of them checks its
+// function pointer, so htpdrv_init() must have succeeded first.
+
+// Forwards to the driver's remote_handle64_open.
+HTPDRV_API int remote_handle64_open(const char * name, remote_handle64 * ph) {
+ return remote_handle64_open_pfn(name, ph);
+}
+
+// Forwards to the driver's remote_handle64_invoke.
+HTPDRV_API int remote_handle64_invoke(remote_handle64 h, uint32_t dwScalars, remote_arg * pra) {
+ return remote_handle64_invoke_pfn(h, dwScalars, pra);
+}
+
+// Forwards to the driver's remote_handle64_close.
+HTPDRV_API int remote_handle64_close(remote_handle64 h) {
+ return remote_handle64_close_pfn(h);
+}
+
+// Forwards to the driver's remote_handle_control.
+HTPDRV_API int remote_handle_control(uint32_t req, void * data, uint32_t datalen) {
+ return remote_handle_control_pfn(req, data, datalen);
+}
+
+// Forwards to the driver's remote_handle64_control.
+HTPDRV_API int remote_handle64_control(remote_handle64 h, uint32_t req, void * data, uint32_t datalen) {
+ return remote_handle64_control_pfn(h, req, data, datalen);
+}
+
+// Forwards to the driver's remote_session_control.
+HTPDRV_API int remote_session_control(uint32_t req, void * data, uint32_t datalen) {
+ return remote_session_control_pfn(req, data, datalen);
+}
+
+#ifdef _WIN32
+
+// Convert a wide-character string to UTF-8.
+// An empty input yields an empty string. If either WideCharToMultiByte call reports failure
+// (returns 0), the error is logged and std::runtime_error is thrown.
+static std::string wstr_to_str(std::wstring_view wstr) {
+    if (wstr.empty()) {
+        return std::string();
+    }
+
+    const wchar_t * src     = wstr.data();
+    const int       src_len = (int) wstr.size();
+
+    // First pass: no output buffer, only ask how many bytes the conversion needs.
+    const int utf8_len = WideCharToMultiByte(CP_UTF8, WC_ERR_INVALID_CHARS, src, src_len, nullptr, 0, nullptr, nullptr);
+    if (utf8_len == 0) {
+        GGML_LOG_ERROR("ggml-hex: WideCharToMultiByte failed. Error %lu\n", GetLastError());
+        throw std::runtime_error("Invalid wstring input");
+    }
+
+    // Second pass: convert into a buffer of exactly that size.
+    std::string utf8(utf8_len, '\0');
+    const int   written =
+        WideCharToMultiByte(CP_UTF8, WC_ERR_INVALID_CHARS, src, src_len, utf8.data(), utf8_len, nullptr, nullptr);
+    if (written == 0) {
+        GGML_LOG_ERROR("ggml-hex: WideCharToMultiByte failed. Error %lu\n", GetLastError());
+        throw std::runtime_error("Wstring conversion failed");
+    }
+
+    return utf8;
+}
+
+// Locate the directory that contains the FastRPC user-mode driver library.
+//
+// The directory is taken from the binary path of the "qcnspmcdm" service as reported by the
+// Service Control Manager: the file name is stripped and the leading "\SystemRoot" placeholder
+// is replaced with the value of the "windir" environment variable.
+//
+// Returns the directory as a UTF-8 string, or an empty string on any failure. Failures are
+// logged; no exception leaves this function.
+static std::string get_driver_path() {
+    std::wstring serviceName = L"qcnspmcdm";
+    std::string  result;
+
+    // Get a handle to the SCM database.
+    SC_HANDLE schSCManager = OpenSCManagerW(NULL, NULL, STANDARD_RIGHTS_READ);
+    if (nullptr == schSCManager) {
+        GGML_LOG_ERROR("ggml-hex: Failed to open SCManager. Error: %lu\n", GetLastError());
+        return result;
+    }
+
+    // Get a handle to the service.
+    SC_HANDLE schService = OpenServiceW(schSCManager,           // SCM database
+                                        serviceName.c_str(),    // name of service
+                                        SERVICE_QUERY_CONFIG);  // need query config access
+    if (nullptr == schService) {
+        GGML_LOG_ERROR("ggml-hex: Failed to open qcnspmcdm service. Error: %lu\n", GetLastError());
+        CloseServiceHandle(schSCManager);
+        return result;
+    }
+
+    // Ask for the size of the configuration buffer. With a null buffer this call is expected
+    // to fail with ERROR_INSUFFICIENT_BUFFER; any other failure is a real error.
+    DWORD bufferSize = 0;
+    if (!QueryServiceConfigW(schService, NULL, 0, &bufferSize) && (GetLastError() != ERROR_INSUFFICIENT_BUFFER)) {
+        GGML_LOG_ERROR("ggml-hex: Failed to query service config. Error: %lu\n", GetLastError());
+        CloseServiceHandle(schService);
+        CloseServiceHandle(schSCManager);
+        return result;
+    }
+
+    // Get the configuration of the service.
+    LPQUERY_SERVICE_CONFIGW serviceConfig = static_cast<LPQUERY_SERVICE_CONFIGW>(LocalAlloc(LMEM_FIXED, bufferSize));
+    if (nullptr == serviceConfig) {
+        GGML_LOG_ERROR("ggml-hex: Failed to allocate service config buffer. Error: %lu\n", GetLastError());
+        CloseServiceHandle(schService);
+        CloseServiceHandle(schSCManager);
+        return result;
+    }
+    if (!QueryServiceConfigW(schService, serviceConfig, bufferSize, &bufferSize)) {
+        GGML_LOG_ERROR("ggml-hex: Failed to query service config. Error: %lu\n", GetLastError());
+        LocalFree(serviceConfig);
+        CloseServiceHandle(schService);
+        CloseServiceHandle(schSCManager);
+        return result;
+    }
+
+    // Read the driver file path and get its parent directory
+    std::wstring driverPath = std::wstring(serviceConfig->lpBinaryPathName);
+    driverPath              = driverPath.substr(0, driverPath.find_last_of(L"\\"));
+
+    // Clean up resources
+    LocalFree(serviceConfig);
+    CloseServiceHandle(schService);
+    CloseServiceHandle(schSCManager);
+
+    // Driver path would contain invalid path string, like:
+    // \SystemRoot\System32\DriverStore\FileRepository\qcadsprpc8280.inf_arm64_c2b9460c9a072f37
+    // "\SystemRoot" should be replaced with a correct one (e.g. C:\Windows)
+    const std::wstring systemRootPlaceholder = L"\\SystemRoot";
+    if (0 != driverPath.compare(0, systemRootPlaceholder.length(), systemRootPlaceholder)) {
+        GGML_LOG_ERROR("ggml-hex: String pattern not found in driver path.\n");
+        return result;
+    }
+
+    // Replace \SystemRoot with an absolute path from system ENV windir
+    const std::wstring systemRootEnv = L"windir";
+
+    // Query the number of wide characters this variable requires
+    DWORD numWords = GetEnvironmentVariableW(systemRootEnv.c_str(), NULL, 0);
+    if (numWords == 0) {
+        GGML_LOG_ERROR("ggml-hex: Failed get systemRoot environment variable\n");
+        return result;
+    }
+
+    // Query the actual system root name from environment variable
+    std::vector<wchar_t> systemRoot(numWords + 1);
+    numWords = GetEnvironmentVariableW(systemRootEnv.c_str(), systemRoot.data(), numWords + 1);
+    if (numWords == 0) {
+        GGML_LOG_ERROR("ggml-hex: Failed to read windir environment variable\n");
+        return result;
+    }
+    driverPath.replace(0, systemRootPlaceholder.length(), std::wstring(systemRoot.data()));
+
+    // wstr_to_str() throws on conversion failure; report that as "no path" like every other
+    // failure in this function instead of letting the exception escape to the caller.
+    try {
+        result = wstr_to_str(driverPath);
+    } catch (const std::exception & e) {
+        GGML_LOG_ERROR("ggml-hex: Failed to convert driver path to UTF-8: %s\n", e.what());
+        result.clear();
+    }
+
+    return result;
+}
+
+#endif
+
+using dl_handle_ptr = std::unique_ptr<dl_handle, dl_handle_deleter>;
+
+// Clear every driver entry point so that no pointer obtained from a library that is about to
+// be released is left behind after a failed initialization.
+static void htpdrv_reset_pfns() {
+    rpcmem_alloc_pfn  = nullptr;
+    rpcmem_alloc2_pfn = nullptr;
+    rpcmem_free_pfn   = nullptr;
+    rpcmem_to_fd_pfn  = nullptr;
+
+    fastrpc_mmap_pfn   = nullptr;
+    fastrpc_munmap_pfn = nullptr;
+
+    dspqueue_create_pfn = nullptr;
+    dspqueue_close_pfn  = nullptr;
+    dspqueue_export_pfn = nullptr;
+    dspqueue_write_pfn  = nullptr;
+    dspqueue_read_pfn   = nullptr;
+
+    remote_handle64_open_pfn    = nullptr;
+    remote_handle64_invoke_pfn  = nullptr;
+    remote_handle64_close_pfn   = nullptr;
+    remote_handle_control_pfn   = nullptr;
+    remote_handle64_control_pfn = nullptr;
+    remote_session_control_pfn  = nullptr;
+}
+
+// Load the FastRPC driver library and resolve the driver entry points used by this file.
+//
+// The first successful call keeps the library handle for the lifetime of the process; later
+// calls return immediately. rpcmem_alloc2 is optional, every other symbol is required.
+//
+// Returns AEE_SUCCESS on success (or when already initialized) and AEE_EUNABLETOLOAD when the
+// library cannot be located/loaded or a required symbol is missing. On failure all entry
+// points are reset to null.
+int htpdrv_init() {
+    static dl_handle_ptr lib_cdsp_rpc_handle = nullptr;
+    static bool          initialized         = false;
+
+    // Check first so repeated calls do not query the driver location again.
+    if (initialized) {
+        GGML_LOG_INFO("ggml-hex: Driver already loaded\n");
+        return AEE_SUCCESS;
+    }
+
+#ifdef _WIN32
+    // An empty directory would turn the library name into a root-relative path: fail instead.
+    const std::string drv_dir = get_driver_path();
+    if (drv_dir.empty()) {
+        GGML_LOG_ERROR("ggml-hex: failed to locate the driver directory\n");
+        return AEE_EUNABLETOLOAD;
+    }
+    std::string drv_path = drv_dir + "\\" + "libcdsprpc.dll";
+#else
+    std::string drv_path = "libcdsprpc.so";
+#endif
+    GGML_LOG_INFO("ggml-hex: Loading driver %s\n", drv_path.c_str());
+
+    fs::path      path{ drv_path.c_str() };
+    dl_handle_ptr handle{ dl_load_library(path) };
+    if (!handle) {
+        GGML_LOG_ERROR("ggml-hex: failed to load %s: %s\n", path.u8string().c_str(), dl_error());
+        return AEE_EUNABLETOLOAD;
+    }
+
+// Resolve one symbol into its global function pointer. A missing required symbol aborts the
+// initialization; `handle` goes out of scope on that return, so the pointers resolved so far
+// are cleared first. (Deliberately not named dlsym to avoid shadowing the POSIX function.)
+#define HTPDRV_LOAD_SYM(drv, type, pfn, symbol, optional)              \
+    do {                                                               \
+        pfn = (type) dl_get_sym(drv, #symbol);                         \
+        if (!(optional) && nullptr == pfn) {                           \
+            GGML_LOG_ERROR("ggml-hex: failed to dlsym %s\n", #symbol); \
+            htpdrv_reset_pfns();                                       \
+            return AEE_EUNABLETOLOAD;                                  \
+        }                                                              \
+    } while (0)
+
+    HTPDRV_LOAD_SYM(handle.get(), rpcmem_alloc_pfn_t, rpcmem_alloc_pfn, rpcmem_alloc, false);
+    HTPDRV_LOAD_SYM(handle.get(), rpcmem_alloc2_pfn_t, rpcmem_alloc2_pfn, rpcmem_alloc2, true);
+    HTPDRV_LOAD_SYM(handle.get(), rpcmem_free_pfn_t, rpcmem_free_pfn, rpcmem_free, false);
+    HTPDRV_LOAD_SYM(handle.get(), rpcmem_to_fd_pfn_t, rpcmem_to_fd_pfn, rpcmem_to_fd, false);
+    HTPDRV_LOAD_SYM(handle.get(), fastrpc_mmap_pfn_t, fastrpc_mmap_pfn, fastrpc_mmap, false);
+    HTPDRV_LOAD_SYM(handle.get(), fastrpc_munmap_pfn_t, fastrpc_munmap_pfn, fastrpc_munmap, false);
+    HTPDRV_LOAD_SYM(handle.get(), dspqueue_create_pfn_t, dspqueue_create_pfn, dspqueue_create, false);
+    HTPDRV_LOAD_SYM(handle.get(), dspqueue_close_pfn_t, dspqueue_close_pfn, dspqueue_close, false);
+    HTPDRV_LOAD_SYM(handle.get(), dspqueue_export_pfn_t, dspqueue_export_pfn, dspqueue_export, false);
+    HTPDRV_LOAD_SYM(handle.get(), dspqueue_write_pfn_t, dspqueue_write_pfn, dspqueue_write, false);
+    HTPDRV_LOAD_SYM(handle.get(), dspqueue_read_pfn_t, dspqueue_read_pfn, dspqueue_read, false);
+    HTPDRV_LOAD_SYM(handle.get(), remote_handle64_open_pfn_t, remote_handle64_open_pfn, remote_handle64_open, false);
+    HTPDRV_LOAD_SYM(handle.get(), remote_handle64_invoke_pfn_t, remote_handle64_invoke_pfn, remote_handle64_invoke, false);
+    HTPDRV_LOAD_SYM(handle.get(), remote_handle_control_pfn_t, remote_handle_control_pfn, remote_handle_control, false);
+    HTPDRV_LOAD_SYM(handle.get(), remote_handle64_control_pfn_t, remote_handle64_control_pfn, remote_handle64_control, false);
+    HTPDRV_LOAD_SYM(handle.get(), remote_session_control_pfn_t, remote_session_control_pfn, remote_session_control, false);
+    HTPDRV_LOAD_SYM(handle.get(), remote_handle64_close_pfn_t, remote_handle64_close_pfn, remote_handle64_close, false);
+
+#undef HTPDRV_LOAD_SYM
+
+    // Keep the library loaded for as long as the resolved pointers are in use.
+    lib_cdsp_rpc_handle = std::move(handle);
+    initialized         = true;
+
+    return AEE_SUCCESS;
+}
+
+// Find the entry of supported_domains whose id equals domain_id.
+// Returns a pointer to that entry, or NULL when no entry matches.
+domain * get_domain(int domain_id) {
+    for (auto & entry : supported_domains) {
+        if (entry.id == domain_id) {
+            return &entry;
+        }
+    }
+
+    return NULL;
+}
+
+// Query the Hexagon architecture version of `domain` via remote_handle_control
+// (DSPRPC_GET_DSP_INFO with the ARCH_VER attribute).
+//
+// On success stores the version (68, 69, 73, 75, 79 or 81) in *arch and returns 0.
+// Returns AEE_EUNSUPPORTEDAPI when the control entry point is missing or reports the query as
+// unsupported, the driver's error code when the query fails, and -1 when the reported value
+// is none of the known versions (*arch is left untouched in all failure cases).
+int get_hex_arch_ver(int domain, int * arch) {
+    if (!remote_handle_control_pfn) {
+        GGML_LOG_ERROR("ggml-hex: remote_handle_control is not supported on this device\n");
+        return AEE_EUNSUPPORTEDAPI;
+    }
+
+    struct remote_dsp_capability query;
+    query.domain       = (uint32_t) domain;
+    query.attribute_ID = ARCH_VER;
+    query.capability   = (uint32_t) 0;
+
+    const int err = remote_handle_control(DSPRPC_GET_DSP_INFO, &query, sizeof(query));
+
+    // only the low byte of the error code is compared against AEE_EUNSUPPORTEDAPI
+    if ((err & 0xff) == (AEE_EUNSUPPORTEDAPI & 0xff)) {
+        GGML_LOG_ERROR("ggml-hex: FastRPC capability API is not supported on this device\n");
+        return AEE_EUNSUPPORTEDAPI;
+    }
+
+    if (err != AEE_SUCCESS) {
+        GGML_LOG_ERROR("ggml-hex: FastRPC capability query failed (err %d)\n", err);
+        return err;
+    }
+
+    // The low byte of the capability spells the version in hex digits (0x73 -> 73).
+    static const struct {
+        uint32_t code;
+        int      version;
+    } known_archs[] = {
+        { 0x68, 68 }, { 0x69, 69 }, { 0x73, 73 }, { 0x75, 75 }, { 0x79, 79 }, { 0x81, 81 },
+    };
+
+    const uint32_t code = query.capability & 0xff;
+    for (const auto & known : known_archs) {
+        if (known.code == code) {
+            *arch = known.version;
+            return 0;
+        }
+    }
+
+    return -1;
+}
--- /dev/null
+#pragma once
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _WIN32
+# pragma clang diagnostic ignored "-Wignored-attributes"
+#endif
+
+#include <AEEStdErr.h>
+#include <rpcmem.h>
+#include <remote.h>
+#include <dspqueue.h>
+
+#if defined(_WIN32) && !defined(__MINGW32__)
+# ifdef GGML_BACKEND_BUILD
+# define HTPDRV_API __declspec(dllexport) extern
+# else
+# define HTPDRV_API __declspec(dllimport) extern
+# endif
+#else
+# define HTPDRV_API __attribute__ ((visibility ("default"))) extern
+#endif
+
+/* Offset to differentiate HLOS and Hexagon error codes.
+ Stores the value of AEE_EOFFSET for Hexagon. */
+#ifndef DSP_OFFSET
+# define DSP_OFFSET 0x80000400
+#endif
+
+/* Errno for connection reset by peer. */
+#ifndef ECONNRESET
+# ifdef __hexagon__
+# define ECONNRESET 104
+# endif
+#endif
+
+/* Abstraction of different OS specific sleep APIs.
+   SLEEP accepts input in seconds.
+   The argument is parenthesized in the Windows variant so that an expression argument,
+   e.g. SLEEP(a + b), is scaled to milliseconds as a whole. */
+#ifndef SLEEP
+# ifdef __hexagon__
+# define SLEEP(x) \
+ { /* Do nothing for simulator. */ \
+ }
+# else
+# ifdef _WIN32
+# define SLEEP(x) Sleep(1000 * (x)) /* Sleep accepts input in milliseconds. */
+# else
+# define SLEEP(x) sleep(x) /* sleep accepts input in seconds. */
+# endif
+# endif
+#endif
+
+/* Include windows specific header files. */
+#ifdef _WIN32
+# include <windows.h>
+# include <sysinfoapi.h>
+# define _CRT_SECURE_NO_WARNINGS 1
+# define _WINSOCK_DEPRECATED_NO_WARNINGS 1
+#endif
+
+/* Includes and defines for all HLOS except windows */
+#if !defined(__hexagon__) && !defined(_WIN32)
+# include "unistd.h"
+
+# include <sys/time.h>
+#endif
+
+/* Includes and defines for Hexagon and all HLOS except Windows. */
+#if !defined(_WIN32)
+/* Weak reference to remote symbol for compilation. */
+# pragma weak remote_session_control
+# pragma weak remote_handle_control
+# pragma weak remote_handle64_control
+# pragma weak fastrpc_mmap
+# pragma weak fastrpc_munmap
+# pragma weak rpcmem_alloc2
+#endif
+
+#if !defined(_WIN32)
+# pragma weak remote_system_request
+#endif
+
+#ifdef _WIN32
+# define DSPQUEUE_TIMEOUT DSPQUEUE_TIMEOUT_NONE
+#else
+# define DSPQUEUE_TIMEOUT 1000000
+#endif
+
+/**
+ * htpdrv_init API: driver interface entry point
+ *
+ * @return Return AEE error codes as defined in Hexagon SDK.
+ */
+HTPDRV_API int htpdrv_init(void);
+
+/**
+ * get_domain API: get domain struct from domain value.
+ *
+ * @param[in] domain_id value of a domain
+ * @return Returns domain struct of the domain if it is supported or else
+ * returns NULL.
+ *
+ */
+HTPDRV_API domain * get_domain(int domain_id);
+
+/**
+ * get_hex_arch_ver API: query the Hexagon processor architecture version information
+ *
+ * @param[in] domain value of a domain
+ * @param[out] arch Hexagon architecture version (73, 75, ...)
+ * @return 0 if query is successful.
+ * non-zero if error, return value points to the error.
+ *
+ */
+HTPDRV_API int get_hex_arch_ver(int domain, int * arch);
+
+#ifdef __cplusplus
+}
+#endif
+++ /dev/null
-
-#pragma clang diagnostic ignored "-Wgnu-anonymous-struct"
-#pragma clang diagnostic ignored "-Wmissing-prototypes"
-#pragma clang diagnostic ignored "-Wsign-compare"
-
-#define GGML_COMMON_IMPL_C
-#include "ggml-backend-impl.h"
-#include "ggml-common.h"
-#include "ggml-hexagon.h"
-#include "ggml-impl.h"
-
-#include "htp-utils.h"
-
-#include <domain.h>
-#include <remote.h>
-#include <stdbool.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-domain * get_domain(int domain_id) {
- int i = 0;
- int size = sizeof(supported_domains) / sizeof(domain);
-
- for (i = 0; i < size; i++) {
- if (supported_domains[i].id == domain_id) {
- return &supported_domains[i];
- }
- }
-
- return NULL;
-}
-
-bool is_valid_domain_id(int domain_id, int compute_only) {
- int i = 0;
- int size = sizeof(supported_domains) / sizeof(domain);
-
- if (compute_only) {
- return is_CDSP(domain_id);
- }
-
- for (i = 0; i < size; i++) {
- if (supported_domains[i].id == domain_id) {
- return true;
- }
- }
-
- return false;
-}
-
-int get_domains_info(char * domain_type, int * num_domains, fastrpc_domain ** domains_info) {
- int nErr = AEE_SUCCESS;
- int ss_info = 0;
- if (domain_type != NULL) {
- if (strcmp(domain_type, "LPASS") == 0) {
- ss_info = FASTRPC_LPASS;
- } else if (strcmp(domain_type, "HPASS") == 0) {
- ss_info = FASTRPC_HPASS;
- } else {
- ss_info = FASTRPC_NSP;
- }
- }
- system_req_payload req = { 0 };
- req.id = FASTRPC_GET_DOMAINS;
- req.sys.domains = NULL;
- fastrpc_domain * domain = NULL;
- if (ss_info != 0) {
- req.sys.flags = DOMAINS_LIST_FLAGS_SET_TYPE(req.sys.flags, ss_info);
- } else {
- req.sys.flags = 0;
- }
-#ifdef _WIN32
- nErr = AEE_EUNSUPPORTED;
- goto bail;
-#endif
- if (remote_system_request) {
- nErr = remote_system_request(&req);
- if (nErr != AEE_SUCCESS) {
- GGML_LOG_ERROR("Failure in remote_system_request call: %d.\n", nErr);
- goto bail;
- }
- // Allocate memory for domain-info array
- req.sys.max_domains = req.sys.num_domains;
- if ((req.sys.domains = calloc(req.sys.num_domains, sizeof(fastrpc_domain))) == NULL) {
- nErr = AEE_ENOMEMORY;
- GGML_LOG_ERROR("Unable to allocate memory for req.sys.domains");
- goto bail;
- }
-
- nErr = remote_system_request(&req);
- if (nErr != AEE_SUCCESS) {
- GGML_LOG_ERROR("Failure in remote_system_request call: %d.\n", nErr);
- goto bail;
- }
-
- for (int i = 0; i < req.sys.num_domains; i++) {
- // Verify that only requested type domains were returned
- domain = &req.sys.domains[i];
- if (domain->type != ss_info && domain_type != NULL) {
- nErr = -1;
- GGML_LOG_ERROR("Incorrect data received from remote_system_request.\n");
- goto bail;
- }
- }
- *domains_info = req.sys.domains;
- *num_domains = req.sys.num_domains;
- } else {
- nErr = AEE_EUNSUPPORTED;
- goto bail;
- }
-bail:
- if (nErr && !req.sys.domains) {
- free(req.sys.domains);
- }
- return nErr;
-}
-
-int get_effective_domain_id(char * domain_name, int session_id, int * effec_domain_id) {
- int err = 0;
- remote_rpc_effective_domain_id_t sess = { 0 };
-
- sess.domain_name = domain_name;
- sess.domain_name_len = strlen(domain_name);
- sess.session_id = session_id;
-
- err = remote_session_control(FASTRPC_GET_EFFECTIVE_DOMAIN_ID, &sess, sizeof(sess));
- if (err) {
- GGML_LOG_ERROR("Error 0x%x: failed to get effective domain id for %s, session id %d\n", err, sess.domain_name,
- session_id);
- return err;
- }
-
- *effec_domain_id = sess.effective_domain_id;
- return err;
-}
-
-int get_dsp_support(int * domain) {
- int nErr = AEE_SUCCESS;
- *domain = CDSP_DOMAIN_ID; // DSP domain default value is CDSP_DOMAIN_ID
-
- if (remote_handle_control) {
- struct remote_dsp_capability dsp_capability_domain = { CDSP_DOMAIN_ID, DOMAIN_SUPPORT, 0 };
- nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_domain, sizeof(struct remote_dsp_capability));
- if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) {
- GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device\n");
- goto bail;
- }
-
- if (dsp_capability_domain.capability == 0) {
- dsp_capability_domain.domain = ADSP_DOMAIN_ID; // Check for ADSP support.
- dsp_capability_domain.attribute_ID = DOMAIN_SUPPORT;
- dsp_capability_domain.capability = 0;
- nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_domain,
- sizeof(struct remote_dsp_capability));
- if (dsp_capability_domain.capability) {
- *domain = ADSP_DOMAIN_ID; // For targets like Agatti (not having cDSP), domain is ADSP_DOMAIN_ID
- }
- }
-
- if (nErr != AEE_SUCCESS) {
- GGML_LOG_ERROR("\nget_dsp_support failed with Error 0x%x\n", nErr);
- goto bail;
- }
- } else {
- nErr = AEE_EUNSUPPORTEDAPI;
- GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device\n");
- }
-
-bail:
- return nErr;
-}
-
-int get_vtcm_info(int domain, uint32_t * capability, uint32_t attr) {
- int nErr = AEE_SUCCESS;
- *capability = 0;
-
- if (attr == VTCM_PAGE || attr == VTCM_COUNT) {
- } else {
- nErr = AEE_EBADPARM;
- GGML_LOG_ERROR("Unsupported attr. Only VTCM_PAGE and VTCM_COUNT supported\n");
- goto bail;
- }
- if (remote_handle_control) {
- if (domain == ADSP_DOMAIN_ID || domain == CDSP_DOMAIN_ID) {
- /*
- * Query the DSP for VTCM information
- * Since the ADSP does not have a dedicated VTCM, we expect the output to be 0
- */
- struct remote_dsp_capability dsp_capability_vtcm_dsp;
- dsp_capability_vtcm_dsp.domain = (uint32_t) domain;
- dsp_capability_vtcm_dsp.attribute_ID = attr;
- dsp_capability_vtcm_dsp.capability = (uint32_t) 0;
- nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_vtcm_dsp,
- sizeof(struct remote_dsp_capability));
- if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) {
- GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device\n");
- GGML_LOG_ERROR("Running the usecase without checking the capability\n");
- nErr = AEE_SUCCESS;
- goto bail;
- } else if (nErr == AEE_SUCCESS) {
- *capability = dsp_capability_vtcm_dsp.capability;
- } else {
- GGML_LOG_ERROR("\nget_vtcm_info failed with Error 0x%x\n", nErr);
- goto bail;
- }
- } else {
- nErr = AEE_EUNSUPPORTED;
- GGML_LOG_ERROR("Unsupported domain %d\n", domain);
- goto bail;
- }
- } else {
- nErr = AEE_EUNSUPPORTEDAPI;
- GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device\n");
- }
-
-bail:
- return nErr;
-}
-
-bool is_unsignedpd_supported(int domain_id) {
- int nErr = AEE_SUCCESS;
- if (remote_handle_control) {
- struct remote_dsp_capability dsp_capability_domain = { domain_id, UNSIGNED_PD_SUPPORT, 0 };
- nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_domain, sizeof(struct remote_dsp_capability));
- if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) {
- GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device. Falling back to signed pd.\n");
- return false;
- }
- if (nErr) {
- GGML_LOG_ERROR("\nERROR 0x%x: FastRPC Capability API failed. Falling back to signed pd.", nErr);
- return false;
- }
- if (dsp_capability_domain.capability == 1) {
- return true;
- }
- } else {
- nErr = AEE_EUNSUPPORTEDAPI;
- GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device. Falling back to signed pd.\n");
- return false;
- }
- return false;
-}
-
-bool get_unsignedpd_support(void) {
- return is_unsignedpd_supported(CDSP_DOMAIN_ID);
-}
-
-bool is_async_fastrpc_supported(int domain) {
- int nErr = AEE_SUCCESS;
- if (remote_handle_control) {
- if (domain == CDSP_DOMAIN_ID) {
- /*
- * Query the DSP for ASYNC_FASTRPC_SUPPORT information
- * Async fastrpc is supported only on CDSP
- */
- struct remote_dsp_capability dsp_capability_async_support;
- dsp_capability_async_support.domain = (uint32_t) domain;
- dsp_capability_async_support.attribute_ID = ASYNC_FASTRPC_SUPPORT;
- dsp_capability_async_support.capability = (uint32_t) 0;
- nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_async_support,
- sizeof(struct remote_dsp_capability));
- if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) {
- GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device\n");
- GGML_LOG_ERROR("Running the usecase without checking the capability\n");
- nErr = AEE_SUCCESS;
- goto bail;
- } else if (dsp_capability_async_support.capability == 1) {
- return true;
- }
- if (nErr != AEE_SUCCESS) {
- GGML_LOG_ERROR("\nis_async_fastrpc_supported failed with Error 0x%x\n", nErr);
- goto bail;
- }
- } else {
- nErr = AEE_EUNSUPPORTED;
- GGML_LOG_ERROR("Async fastrpc is not supported on domain %d\n", domain);
- goto bail;
- }
- } else {
- nErr = AEE_EUNSUPPORTEDAPI;
- GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device\n");
- }
-
-bail:
- return false;
-}
-
-bool is_status_notification_supported(int domain) {
- int nErr = AEE_SUCCESS;
-
- if (remote_handle_control) {
- /*
- * Query the DSP for STATUS_NOTIFICATION_SUPPORT information
- * DSP User PD status notification Support
- */
- struct remote_dsp_capability dsp_capability_status_notification_support;
- dsp_capability_status_notification_support.domain = (uint32_t) domain;
- dsp_capability_status_notification_support.attribute_ID = STATUS_NOTIFICATION_SUPPORT;
- dsp_capability_status_notification_support.capability = (uint32_t) 0;
- nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_status_notification_support,
- sizeof(struct remote_dsp_capability));
- if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) {
- GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device\n");
- GGML_LOG_ERROR("Running the usecase without checking the capability\n");
- nErr = AEE_SUCCESS;
- goto bail;
- } else if (dsp_capability_status_notification_support.capability == 1) {
- return true;
- }
- if (nErr != AEE_SUCCESS) {
- GGML_LOG_ERROR("\nis_status_notification_supported failed with Error 0x%x\n", nErr);
- goto bail;
- }
- } else {
- nErr = AEE_EUNSUPPORTEDAPI;
- GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device\n");
- }
-
-bail:
- return false;
-}
-
-int get_hmx_support_info(int domain, uint32_t * capability, uint32_t attr) {
- int nErr = AEE_SUCCESS;
- *capability = 0;
-
- if (attr != HMX_SUPPORT_SPATIAL && attr != HMX_SUPPORT_DEPTH) {
- nErr = AEE_EBADPARM;
- GGML_LOG_ERROR("Unsupported attr. Only HMX_SUPPORT_SPATIAL and HMX_SUPPORT_DEPTH supported\n");
- goto bail;
- }
- if (remote_handle_control) {
- if (domain == CDSP_DOMAIN_ID) {
- /*
- * Query the DSP for HMX SUPPORT information
- * HMX is supported on CDSP only
- */
- struct remote_dsp_capability dsp_capability_hmx_dsp;
- dsp_capability_hmx_dsp.domain = (uint32_t) domain;
- dsp_capability_hmx_dsp.attribute_ID = attr;
- dsp_capability_hmx_dsp.capability = (uint32_t) 0;
- nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_hmx_dsp,
- sizeof(struct remote_dsp_capability));
- if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) {
- GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device\n");
- GGML_LOG_ERROR("Running the usecase without checking the capability\n");
- nErr = AEE_SUCCESS;
- goto bail;
- } else if (nErr == AEE_SUCCESS) {
- *capability = dsp_capability_hmx_dsp.capability;
- } else {
- GGML_LOG_ERROR("\nget_hmx_support_info failed with Error 0x%x\n", nErr);
- goto bail;
- }
- } else {
- nErr = AEE_EUNSUPPORTED;
- GGML_LOG_ERROR("HMX support is not there for domain %d\n", domain);
- goto bail;
- }
- } else {
- nErr = AEE_EUNSUPPORTEDAPI;
- GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device\n");
- }
-
-bail:
- return nErr;
-}
-
-int get_hex_arch_ver(int domain, int * arch) {
- if (!remote_handle_control) {
- GGML_LOG_ERROR("ggml-hex: remote_handle_control is not supported on this device\n");
- return AEE_EUNSUPPORTEDAPI;
- }
-
- struct remote_dsp_capability arch_ver;
- arch_ver.domain = (uint32_t) domain;
- arch_ver.attribute_ID = ARCH_VER;
- arch_ver.capability = (uint32_t) 0;
-
- int err = remote_handle_control(DSPRPC_GET_DSP_INFO, &arch_ver, sizeof(arch_ver));
- if ((err & 0xff) == (AEE_EUNSUPPORTEDAPI & 0xff)) {
- GGML_LOG_ERROR("ggml-hex: FastRPC capability API is not supported on this device\n");
- return AEE_EUNSUPPORTEDAPI;
- }
-
- if (err != AEE_SUCCESS) {
- GGML_LOG_ERROR("ggml-hex: FastRPC capability query failed (err %d)\n", err);
- return err;
- }
-
- switch (arch_ver.capability & 0xff) {
- case 0x68:
- *arch = 68;
- return 0;
- case 0x69:
- *arch = 69;
- return 0;
- case 0x73:
- *arch = 73;
- return 0;
- case 0x75:
- *arch = 75;
- return 0;
- case 0x79:
- *arch = 79;
- return 0;
- case 0x81:
- *arch = 81;
- return 0;
- }
- return -1;
-}
-
-int get_hvx_support_info(int domain, uint32_t * capability, uint32_t attr) {
- int nErr = AEE_SUCCESS;
- *capability = 0;
-
- if (remote_handle_control) {
- if (domain == CDSP_DOMAIN_ID) {
- /*
- * Query the DSP for HVX SUPPORT information
- * HVX is supported on CDSP only
- */
- struct remote_dsp_capability dsp_capability_hvx_dsp;
- dsp_capability_hvx_dsp.domain = (uint32_t) domain;
- dsp_capability_hvx_dsp.attribute_ID = attr;
- dsp_capability_hvx_dsp.capability = (uint32_t) 0;
- nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_hvx_dsp,
- sizeof(struct remote_dsp_capability));
- if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) {
- GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device\n");
- GGML_LOG_ERROR("Running the usecase without checking the capability\n");
- nErr = AEE_SUCCESS;
- goto bail;
- } else if (nErr == AEE_SUCCESS) {
- *capability = dsp_capability_hvx_dsp.capability;
- } else {
- GGML_LOG_ERROR("\nget_hvx_support_info failed with Error 0x%x\n", nErr);
- goto bail;
- }
- } else {
- nErr = AEE_EUNSUPPORTED;
- GGML_LOG_ERROR("HVX support is not available on domain %d\n", domain);
- goto bail;
- }
- } else {
- nErr = AEE_EUNSUPPORTEDAPI;
- GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device\n");
- }
-
-bail:
- return nErr;
-}
+++ /dev/null
-#ifndef HTP_UTILS_H
-#define HTP_UTILS_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include <AEEStdErr.h>
-#include <inttypes.h>
-#include <remote.h>
-#include <rpcmem.h>
-#include <stdbool.h>
-
-/* Offset to differentiate HLOS and Hexagon error codes.
- Stores the value of AEE_EOFFSET for Hexagon. */
-#ifndef DSP_OFFSET
-# define DSP_OFFSET 0x80000400
-#endif
-
-/* Errno for connection reset by peer. */
-#ifndef ECONNRESET
-# ifdef __hexagon__
-# define ECONNRESET 104
-# endif
-#endif
-
-/* Abstraction of different OS specific sleep APIs.
-   SLEEP accepts input in seconds.
-   The argument is parenthesized in the Windows expansion so that an expression
-   such as SLEEP(a + b) is converted to milliseconds as a whole instead of
-   expanding to Sleep(1000 * a + b). */
-#ifndef SLEEP
-#    ifdef __hexagon__
-#        define SLEEP(x)                      \
-            { /* Do nothing for simulator. */ \
-            }
-#    else
-#        ifdef _WINDOWS
-#            define SLEEP(x) Sleep(1000 * (x)) /* Sleep accepts input in milliseconds. */
-#        else
-#            define SLEEP(x) sleep(x) /* sleep accepts input in seconds. */
-#        endif
-#    endif
-#endif
-
-/* Include windows specific header files. */
-#ifdef _WINDOWS
-# include <sysinfoapi.h>
-# include <windows.h>
-# define _CRT_SECURE_NO_WARNINGS 1
-# define _WINSOCK_DEPRECATED_NO_WARNINGS 1
-/* Including this file for custom implementation of getopt function. */
-# include "getopt_custom.h"
-#endif
-
-/* Includes and defines for all HLOS except windows */
-#if !defined(__hexagon__) && !defined(_WINDOWS)
-# include "unistd.h"
-
-# include <sys/time.h>
-#endif
-
-/* Includes and defines for Hexagon and all HLOS except Windows. */
-#if !defined(_WINDOWS)
-/* Weak reference to remote symbol for compilation. */
-# pragma weak remote_session_control
-# pragma weak remote_handle_control
-# pragma weak remote_handle64_control
-# pragma weak fastrpc_mmap
-# pragma weak fastrpc_munmap
-# pragma weak rpcmem_alloc2
-#endif
-
-#if !defined(_WINDOWS)
-# pragma weak remote_system_request
-#endif
-/**
- * Wrapper for FastRPC Capability API: query DSP support.
- *
- * @param[out] domain pointer to supported domain.
- * @return 0 if query is successful.
- * non-zero if error, return value points to the error.
- */
-int get_dsp_support(int * domain);
-
-/**
- * Wrapper for FastRPC Capability API: query VTCM information.
- *
- * @param[in]  domain      ID of the domain being queried.
- * @param[out] capability  capability value of the queried attribute.
- * @param[in]  attr        attribute to query.
- * @return     0 if query is successful.
- *             non-zero if error, return value points to the error.
- */
-int get_vtcm_info(int domain, uint32_t * capability, uint32_t attr);
-
-/**
- * Wrapper for FastRPC Capability API: query unsigned pd support on CDSP domain.
- *
- * @return true if unsigned pd is supported.
- * false if unsigned pd is not supported, capability query failed.
- */
-
-bool get_unsignedpd_support(void);
-
-/**
- * Wrapper for FastRPC Capability API: query unsigned pd support.
- *
- * @param[in] domain_id ID of the domain being queried.
- * @return true if unsigned pd is supported.
- *         false if unsigned pd is not supported or the capability query failed.
- */
-
-bool is_unsignedpd_supported(int domain_id);
-
-/**
- * is_valid_domain_id API: query a domain id is valid.
- *
- * @param[in] domain value of domain in the queried.
- * @param[in] compute_only value of domain is only compared with CDSP domains supported by the target when enabled.
- * @return true if value of domain is valid.
- * false if value of domain is not valid.
- */
-
-bool is_valid_domain_id(int domain_id, int compute_only);
-
-/**
- * get_domain API: get domain struct from domain value.
- *
- * @param[in] domain value of a domain
- * @return Returns domain struct of the domain if it is supported or else
- * returns NULL.
- *
- */
-
-domain * get_domain(int domain_id);
-
-/**
- * get_domains_info API: get information for all the domains available on the device
- *
- * @param[in] domain_type pointer to domain type
- * @param[in] num_domains pointer to number of domains
- * @param[in] domains_info pointer to save discovered domains information.
- * @return 0 if query is successful.
- * non-zero if error, return value points to the error.
- *
- * It is user's responsibility to free the memory used to store the domains info whose address is present in domains_info before closing the application.
- *
- */
-
-int get_domains_info(char * domain_type, int * num_domains, fastrpc_domain ** domains_info);
-
-/**
- * get_effective_domain_id API: get effective domain id for given session id
- *
- * @param[in] domain_name pointer to domain name
- * @param[in] session_id
- * @param[in] effec_domain_id pointer to save obtained effective domain id.
- * @return 0 if query is successful.
- * non-zero if error, return value points to the error.
- *
- */
-
-int get_effective_domain_id(char * domain_name, int session_id, int * effec_domain_id);
-
-/**
- * is_async_fastrpc_supported API: query a domain id has async fastrpc supported or not
- *
- * @param[in] domain_id value of a domain
- * @return Returns true or false stating support of Async FastRPC
- *
- */
-
-bool is_async_fastrpc_supported(int domain_id);
-
-/**
- * is_status_notification_supported API: query the DSP for STATUS_NOTIFICATION_SUPPORT information
- *
- * @param[in] domain_id value of a domain
- * @return Returns true or false stating status notification support information
- *
- */
-bool is_status_notification_supported(int domain_id);
-
-/**
- * get_hmx_support_info API: query the DSP for HMX SUPPORT information
- *
- * @param[in] domain_id value of a domain
- * @param[out] capability capability value of the attribute queried.
- * @param[in] attr value of the attribute to the queried.
- * @return 0 if query is successful.
- * non-zero if error, return value points to the error.
- *
- */
-int get_hmx_support_info(int domain, uint32_t * capability, uint32_t attr);
-
-/**
- * get_hex_arch_ver API: query the Hexagon processor architecture version information
- *
- * @param[in]  domain  ID of the domain being queried.
- * @param[out] arch    architecture version (73, 75, ...)
- * @return     0 if query is successful.
- *             non-zero if error, return value points to the error.
- *
- */
-int get_hex_arch_ver(int domain, int * arch);
-
-/**
- * get_hvx_support_info API: query the DSP for HVX SUPPORT information
- *
- * @param[in] domain_id value of a domain
- * @param[out] capability capability value of the attribute queried.
- * @param[in] attr value of the attribute to the queried.
- * @return 0 if query is successful.
- * non-zero if error, return value points to the error.
- *
- */
-int get_hvx_support_info(int domain, uint32_t * capability, uint32_t attr);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif  // HTP_UTILS_H
--- /dev/null
+#pragma once
+
+#ifdef _WIN32
+# define WIN32_LEAN_AND_MEAN
+# ifndef NOMINMAX
+# define NOMINMAX
+# endif
+# include <windows.h>
+# include <winevt.h>
+#else
+# include <dlfcn.h>
+# include <unistd.h>
+#endif
+#include <filesystem>
+
+namespace fs = std::filesystem;
+
+#ifdef _WIN32
+
+using dl_handle = std::remove_pointer_t<HMODULE>;
+
+// Functor that releases a Windows module handle by calling FreeLibrary on it.
+// NOTE(review): presumably used as the deleter of a smart pointer holding a dl_handle - confirm at the use sites.
+struct dl_handle_deleter {
+    void operator()(HMODULE module) {
+        FreeLibrary(module);
+    }
+};
+
+// Loads the library at `path` with LoadLibraryW and returns whatever handle that call produced.
+static inline dl_handle * dl_load_library(const fs::path & path) {
+    // suppress error dialogs for missing DLLs while the load is attempted
+    const DWORD saved_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
+    SetErrorMode(saved_mode | SEM_FAILCRITICALERRORS);
+
+    const auto wide_path = path.wstring();
+    HMODULE    module    = LoadLibraryW(wide_path.c_str());
+
+    // put the error mode back the way it was found
+    SetErrorMode(saved_mode);
+
+    return module;
+}
+
+// Looks up the exported symbol `name` in `handle` via GetProcAddress and returns it as an untyped pointer.
+static inline void * dl_get_sym(dl_handle * handle, const char * name) {
+    // same error-dialog suppression as used when loading the library
+    const DWORD saved_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
+    SetErrorMode(saved_mode | SEM_FAILCRITICALERRORS);
+
+    FARPROC proc = GetProcAddress(handle, name);
+
+    SetErrorMode(saved_mode);
+
+    return (void *) proc;
+}
+
+// Windows counterpart of the dlerror-based dl_error: always returns an empty string,
+// i.e. no error description is reported on this platform.
+static inline const char * dl_error() {
+    return "";
+}
+
+#else
+
+using dl_handle = void;
+
+// Functor that closes a library handle by calling dlclose on it.
+// NOTE(review): presumably used as the deleter of a smart pointer holding a dl_handle - confirm at the use sites.
+struct dl_handle_deleter {
+    void operator()(void * lib) {
+        dlclose(lib);
+    }
+};
+
+// Opens the shared library at `path` with dlopen (RTLD_NOW | RTLD_LOCAL) and returns the resulting handle.
+static inline dl_handle * dl_load_library(const fs::path & path) {
+    return dlopen(path.string().c_str(), RTLD_NOW | RTLD_LOCAL);
+}
+
+// Resolves the symbol `name` in `handle` via dlsym and returns its address.
+static inline void * dl_get_sym(dl_handle * handle, const char * name) {
+    void * sym = dlsym(handle, name);
+    return sym;
+}
+
+// Returns the message reported by dlerror(), or an empty string when dlerror() returns null.
+static inline const char * dl_error() {
+    if (const char * msg = dlerror()) {
+        return msg;
+    }
+    return "";
+}
+
+#endif
--- /dev/null
+; Setup information file that copies the libggml-htp-v*.so libraries on ARM64 Windows.
+[Version]
+Signature = "$WINDOWS NT$"
+Class = ComputeAccelerator
+ClassGuid = {F01A9D53-3FF6-48D2-9F97-C8A7004BE10C}
+Provider = %GGML%
+DriverVer = 01/01/2026,1.0.0.0
+CatalogFile = libggml-htp.cat
+PnpLockDown = 1
+
+; Destination of the [Drivers_Dir] file list, given as a dirid.
+; NOTE(review): confirm which directory dirid 6 resolves to on the target system.
+[DestinationDirs]
+Drivers_Dir = 6
+
+; Single source disk, labelled by the %DiskId% string.
+[SourceDisksNames]
+1 = %DiskId%
+
+; Every library file is located on source disk 1.
+[SourceDisksFiles]
+libggml-htp-v68.so = 1
+libggml-htp-v69.so = 1
+libggml-htp-v73.so = 1
+libggml-htp-v75.so = 1
+libggml-htp-v81.so = 1
+
+[ControlFlags]
+ExcludeFromSelect = *
+
+; Install action for ARM64: copy the files listed in [Drivers_Dir].
+[DefaultInstall.NTarm64]
+CopyFiles=Drivers_Dir
+
+; Files to copy; flag 0x10 is applied to each entry (see the inline flag name).
+[Drivers_Dir]
+libggml-htp-v68.so,,,0x10 ;COPYFLG_NO_OVERWRITE
+libggml-htp-v69.so,,,0x10 ;COPYFLG_NO_OVERWRITE
+libggml-htp-v73.so,,,0x10 ;COPYFLG_NO_OVERWRITE
+libggml-htp-v75.so,,,0x10 ;COPYFLG_NO_OVERWRITE
+libggml-htp-v81.so,,,0x10 ;COPYFLG_NO_OVERWRITE
+
+; String tokens referenced above as %GGML% and %DiskId%.
+[Strings]
+GGML = 'GGML'
+DiskId = 'GGML HTP library'
--- /dev/null
+#!/usr/bin/env pwsh
+
+# Runs llama-bench.exe from the pkg-snapdragon package.
+#
+# The shebang has to be the very first line of the file to take effect, so no
+# blank line may precede it.
+#
+# Optional environment variables:
+#   M       model file name, looked up under ..\..\gguf relative to the package
+#   D       device passed to --device (default: HTP0)
+#   V       copied to GGML_HEXAGON_VERBOSE
+#   OPMASK  copied to GGML_HEXAGON_OPMASK
+#   NHVX    copied to GGML_HEXAGON_NHVX
+#   NDEV    copied to GGML_HEXAGON_NDEV
+# All script arguments are appended to the llama-bench command line.
+
+# Basedir on device
+$basedir=".\pkg-snapdragon"
+
+$cli_opts=$args
+
+$model="Llama-3.2-3B-Instruct-Q4_0.gguf"
+if ($null -ne $env:M) {
+    $model=$env:M
+}
+
+$device="HTP0"
+if ($null -ne $env:D) {
+    $device=$env:D
+}
+
+if ($null -ne $env:V) {
+    $env:GGML_HEXAGON_VERBOSE=$env:V
+}
+
+if ($null -ne $env:OPMASK) {
+    $env:GGML_HEXAGON_OPMASK=$env:OPMASK
+}
+
+if ($null -ne $env:NHVX) {
+    $env:GGML_HEXAGON_NHVX=$env:NHVX
+}
+
+if ($null -ne $env:NDEV) {
+    $env:GGML_HEXAGON_NDEV=$env:NDEV
+}
+
+$env:ADSP_LIBRARY_PATH="$basedir\lib"
+
+& "$basedir\bin\llama-bench.exe" `
+    --mmap 0 -m $basedir\..\..\gguf\$model `
+    --poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 `
+    --batch-size 128 -ngl 99 --device $device $cli_opts
--- /dev/null
+#!/usr/bin/env pwsh
+
+# Runs llama-completion.exe from the pkg-snapdragon package.
+#
+# The shebang has to be the very first line of the file to take effect, so no
+# blank line may precede it.
+#
+# Optional environment variables:
+#   M       model file name, looked up under ..\..\gguf relative to the package
+#   D       device passed to --device (default: HTP0)
+#   V       copied to GGML_HEXAGON_VERBOSE
+#   E       copied to GGML_HEXAGON_EXPERIMENTAL
+#   SCHED   copied to GGML_SCHED_DEBUG; also appends -v to the command line
+#   PROF    copied to GGML_HEXAGON_PROFILE; also sets GGML_HEXAGON_OPSYNC=1
+#   OPMASK  copied to GGML_HEXAGON_OPMASK
+#   NHVX    copied to GGML_HEXAGON_NHVX
+#   NDEV    copied to GGML_HEXAGON_NDEV
+# All script arguments are appended to the llama-completion command line.
+
+# Basedir on device
+$basedir=".\pkg-snapdragon"
+
+# Kept as an array so that every option reaches the executable as its own argument
+$cli_opts=@($args)
+
+$model="Llama-3.2-3B-Instruct-Q4_0.gguf"
+if ($null -ne $env:M) {
+    $model=$env:M
+}
+
+$device="HTP0"
+if ($null -ne $env:D) {
+    $device=$env:D
+}
+
+if ($null -ne $env:V) {
+    $env:GGML_HEXAGON_VERBOSE=$env:V
+}
+
+if ($null -ne $env:E) {
+    $env:GGML_HEXAGON_EXPERIMENTAL=$env:E
+}
+
+if ($null -ne $env:SCHED) {
+    # Append -v as a separate element: interpolating the array into a string
+    # would pass all options to the executable as one single argument.
+    $env:GGML_SCHED_DEBUG=$env:SCHED; $cli_opts += "-v"
+}
+
+if ($null -ne $env:PROF) {
+    $env:GGML_HEXAGON_PROFILE=$env:PROF; $env:GGML_HEXAGON_OPSYNC=1
+}
+
+if ($null -ne $env:OPMASK) {
+    $env:GGML_HEXAGON_OPMASK=$env:OPMASK
+}
+
+if ($null -ne $env:NHVX) {
+    $env:GGML_HEXAGON_NHVX=$env:NHVX
+}
+
+if ($null -ne $env:NDEV) {
+    $env:GGML_HEXAGON_NDEV=$env:NDEV
+}
+
+$env:ADSP_LIBRARY_PATH="$basedir\lib"
+
+& "$basedir\bin\llama-completion.exe" `
+    --no-mmap -no-cnv -m $basedir\..\..\gguf\$model `
+    --poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 `
+    --ctx-size 8192 --batch-size 128 -ctk q8_0 -ctv q8_0 -fa on `
+    -ngl 99 --device $device $cli_opts
--- /dev/null
+#!/usr/bin/env pwsh
+
+# Runs an arbitrary executable from the pkg-snapdragon package.
+#
+# Usage: <this script> <tool> [tool arguments...]
+#
+# The shebang has to be the very first line of the file to take effect, so no
+# blank line may precede it.
+#
+# Optional environment variables:
+#   V       copied to GGML_HEXAGON_VERBOSE
+#   E       copied to GGML_HEXAGON_EXPERIMENTAL
+#   SCHED   copied to GGML_SCHED_DEBUG; also appends -v to the tool arguments
+#   PROF    copied to GGML_HEXAGON_PROFILE; also sets GGML_HEXAGON_OPSYNC=1
+#   OPMASK  copied to GGML_HEXAGON_OPMASK
+#   NHVX    copied to GGML_HEXAGON_NHVX
+#   NDEV    copied to GGML_HEXAGON_NDEV
+
+# Basedir on device
+$basedir=".\pkg-snapdragon"
+
+if ($args.Count -eq 0) {
+    Write-Host "No arguments provided.Expected the tool and argument to run."
+    exit -1
+}
+
+$tool=$args[0]
+
+# Everything after the tool name is forwarded to the tool. Kept as an array so
+# that every option reaches the executable as its own argument.
+$cli_opts=@()
+if ($args.Count -gt 1) {
+    $cli_opts=@($args[1..($args.Count - 1)])
+}
+
+# NOTE(review): $device is resolved here but is not passed to the tool below.
+$device="HTP0"
+if ($null -ne $env:D) {
+    $device=$env:D
+}
+
+if ($null -ne $env:V) {
+    $env:GGML_HEXAGON_VERBOSE=$env:V
+}
+
+if ($null -ne $env:E) {
+    $env:GGML_HEXAGON_EXPERIMENTAL=$env:E
+}
+
+if ($null -ne $env:SCHED) {
+    # Append -v as a separate element: interpolating the array into a string
+    # would pass all options to the tool as one single argument.
+    $env:GGML_SCHED_DEBUG=$env:SCHED; $cli_opts += "-v"
+}
+
+if ($null -ne $env:PROF) {
+    $env:GGML_HEXAGON_PROFILE=$env:PROF; $env:GGML_HEXAGON_OPSYNC=1
+}
+
+if ($null -ne $env:OPMASK) {
+    $env:GGML_HEXAGON_OPMASK=$env:OPMASK
+}
+
+if ($null -ne $env:NHVX) {
+    $env:GGML_HEXAGON_NHVX=$env:NHVX
+}
+
+if ($null -ne $env:NDEV) {
+    $env:GGML_HEXAGON_NDEV=$env:NDEV
+}
+
+$env:ADSP_LIBRARY_PATH="$basedir\lib"
+
+& "$basedir\bin\$tool" $cli_opts
--- /dev/null
+# Running as Administrator is NOT strictly necessary for setting User-scope env vars,
+# but it is recommended if permissions restrict creating directories in the C:\ root.
+
+# Treat every error as terminating for the rest of the script
+$ErrorActionPreference = "Stop"
+
+# --- Configuration ---
+# Root directory that holds both SDK trees
+$BaseDir = "C:\Qualcomm"
+
+# SDK 1: Hexagon (versions first, then the locations derived from them)
+$HexagonSdkVersion = "6.4.0.2"
+$HexagonToolsVersion = "19.0.04"
+$HexagonUrl = "https://github.com/snapdragon-toolchain/hexagon-sdk/releases/download/v6.4.0.2/hexagon-sdk-v6.4.0.2-arm64-wos.tar.xz"
+$HexagonParent = Join-Path -Path $BaseDir -ChildPath "Hexagon_SDK"
+$HexagonSdkTarget = Join-Path -Path $HexagonParent -ChildPath $HexagonSdkVersion
+$HexagonToolsTarget = Join-Path -Path $HexagonSdkTarget -ChildPath "\tools\HEXAGON_Tools\$HexagonToolsVersion"
+
+# SDK 2: OpenCL
+$OpenCLVersion = "2.3.2"
+$OpenCLUrl = "https://github.com/snapdragon-toolchain/opencl-sdk/releases/download/v2.3.2/adreno-opencl-sdk-v2.3.2-arm64-wos.tar.xz"
+$OpenCLParent = Join-Path -Path $BaseDir -ChildPath "OpenCL_SDK"
+$OpenCLTarget = Join-Path -Path $OpenCLParent -ChildPath $OpenCLVersion
+
+# --- Helper Function ---
+# Downloads and unpacks one SDK archive unless its versioned directory already exists.
+#
+#   Url        download location of the .tar.xz archive
+#   ParentDir  directory that receives the temporary archive file; created when missing
+#   TargetDir  versioned SDK directory; its presence means "already installed"
+#   Name       display name used in progress and error messages
+#
+# On failure the target directory is removed again so that the next run retries
+# the installation, and an error is reported through Write-Error.
+function Install-QualcommSDK {
+    param (
+        [string]$Url,
+        [string]$ParentDir,
+        [string]$TargetDir,
+        [string]$Name
+    )
+
+    # 1. Create Parent Directory
+    if (-not (Test-Path -Path $ParentDir)) {
+        Write-Host "Creating directory: $ParentDir" -ForegroundColor Cyan
+        New-Item -Path $ParentDir -ItemType Directory -Force | Out-Null
+    }
+
+    # 2. Check for Specific Version Directory
+    if (Test-Path -Path $TargetDir) {
+        Write-Host "$Name ($TargetDir) already exists. Skipping download." -ForegroundColor Green
+        return
+    }
+
+    Write-Host "$Name not found. preparing to download..." -ForegroundColor Yellow
+
+    # Create the versioned directory up front
+    New-Item -Path $TargetDir -ItemType Directory -Force | Out-Null
+
+    # Define temporary archive path
+    $TempFile = Join-Path $ParentDir "temp_sdk.tar.xz"
+
+    try {
+        # Download
+        Write-Host "Downloading from: $Url"
+        Invoke-WebRequest -Uri $Url -OutFile $TempFile
+
+        # Untar
+        # Note: We assume Windows includes tar.exe (Win 10 build 17063+)
+        Write-Host "Extracting archive to $TargetDir..."
+
+        # -C points at the PARENT of the target directory.
+        # NOTE(review): this presumably relies on the archive carrying the
+        # versioned directory as its top-level folder - confirm against the archives.
+        tar -xJvf $TempFile -C "$TargetDir\.."
+
+        # A failing native command does not raise a PowerShell error by itself,
+        # so the exit code has to be checked explicitly.
+        if ($LASTEXITCODE -ne 0) {
+            throw "tar exited with code $LASTEXITCODE"
+        }
+
+        Write-Host "Extraction complete." -ForegroundColor Green
+    }
+    catch {
+        # Cleanup target dir first so the script tries again next time. This must
+        # happen BEFORE Write-Error: when $ErrorActionPreference is "Stop" the
+        # Write-Error call terminates and nothing after it in this block runs.
+        Remove-Item -Path $TargetDir -Recurse -Force -ErrorAction SilentlyContinue
+        Write-Error "Failed to download or extract $Name. Error: $_"
+    }
+    finally {
+        # Cleanup Archive
+        if (Test-Path $TempFile) { Remove-Item $TempFile -Force }
+    }
+}
+
+# --- Execution ---
+
+# 1. Make sure the base directory exists before anything is installed into it
+if (-not (Test-Path $BaseDir)) {
+    $null = New-Item -Path $BaseDir -ItemType Directory -Force
+}
+
+# 2. Install both SDKs
+Install-QualcommSDK -Url $HexagonUrl -ParentDir $HexagonParent -TargetDir $HexagonSdkTarget -Name "Hexagon SDK"
+Install-QualcommSDK -Url $OpenCLUrl -ParentDir $OpenCLParent -TargetDir $OpenCLTarget -Name "OpenCL SDK"
+
+# --- Environment Variables ---
+
+Write-Host "`nSetting Environment Variables..." -ForegroundColor Cyan
+
+# Variable name -> SDK location, handled in the order listed
+$SdkEnvironment = [ordered]@{
+    'OPENCL_SDK_ROOT'    = $OpenCLTarget
+    'HEXAGON_SDK_ROOT'   = $HexagonSdkTarget
+    'HEXAGON_TOOLS_ROOT' = $HexagonToolsTarget
+}
+
+foreach ($Entry in $SdkEnvironment.GetEnumerator()) {
+    # Persist the variable for the current user ...
+    [System.Environment]::SetEnvironmentVariable($Entry.Key, $Entry.Value, [System.EnvironmentVariableTarget]::User)
+    # ... and set it for the current session as well
+    Set-Item -Path "Env:$($Entry.Key)" -Value $Entry.Value
+    Write-Host "$($Entry.Key) set to: $($Entry.Value)"
+}