From: Todor Boinovski Date: Thu, 29 Jan 2026 20:33:21 +0000 (-0800) Subject: hexagon: enable offloading to Hexagon on Windows on Snapdragon (#19150) X-Git-Tag: upstream/0.0.8067~191 X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=ce38a4db478b90542874cd4af5cb48b3a0fcf311;p=pkg%2Fggml%2Fsources%2Fllama.cpp hexagon: enable offloading to Hexagon on Windows on Snapdragon (#19150) * hexagon: updates to enable offloading to HTP on WoS * Update windows.md * Update windows.md * hexagon: enable -O3 optimizations * hexagon: move all _WINDOWS conditional compilation to _WIN32 * hexagon: updates to enable offloading to HTP on WoS * hexagon: use run-time vs load-time dynamic linking for cdsp driver interface * refactor htp-drv * hexagon: add run-bench.ps1 script * hexagon: htdrv refactor * hexagon: unify Android and Windows build readmes * hexagon: update README.md * hexagon: refactor htpdrv * hexagon: drv refactor * hexagon: more drv refactor * hexagon: fixes for android builds * hexagon: factor out dl into ggml-backend-dl * hexagon: add run-tool.ps1 script * hexagon: merge htp-utils in htp-drv and remove unused code * wos: no need for getopt_custom.h * wos: add missing CR in htpdrv * hexagon: ndev enforecement applies only to the Android devices * hexagon: add support for generating and signing .cat file * hexagon: add .inf file * hexagon: working auto-signing and improved windows builds * hexagon: futher improve skel build * hexagon: add rough WoS guide * hexagon: updated windows guide * hexagon: improve cmake handling of certs and logging * hexagon: improve windows setup/build doc * hexagon: more windows readme updates * hexagon: windows readme updates * hexagon: windows readme updates * hexagon: windows readme updates * hexagon: windows readme updates * Update windows.md * Update windows.md * snapdragon: rename docs/backend/hexagon to docs/backends/snapdragon Also added a power shell script to simplify build env setup. * hexagon: remove trailing whitespace and move cmake requirement to user-presets * hexagon: fix CMakeUserPresets path in workflow yaml * hexagon: introduce local version of libdl.h * hexagon: fix src1 reuse logic gpt-oss needs a bigger lookahead window. The check for src[1] itself being quantized was wrong. --------- Co-authored-by: Max Krasnyansky --- diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 551bdd3df..d3a1638f1 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1371,7 +1371,7 @@ jobs: id: update_presets if: ${{ matrix.build == 'arm64-snapdragon' }} run: | - cp docs/backend/hexagon/CMakeUserPresets.json . + cp docs/backend/snapdragon/CMakeUserPresets.json . - name: Build id: ndk_build diff --git a/docs/backend/hexagon/CMakeUserPresets.json b/docs/backend/hexagon/CMakeUserPresets.json deleted file mode 100644 index 1f2676c0b..000000000 --- a/docs/backend/hexagon/CMakeUserPresets.json +++ /dev/null @@ -1,51 +0,0 @@ -{ - "version": 4, - "configurePresets": [ - { - "name": "arm64-android-snapdragon", - "hidden": true, - "architecture": { "value": "arm64", "strategy": "external" }, - "toolset": { "value": "host=x86_64", "strategy": "external" }, - "cacheVariables": { - "ANDROID_ABI": "arm64-v8a", - "ANDROID_PLATFORM": "android-31", - "CMAKE_TOOLCHAIN_FILE": "$env{ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake", - "CMAKE_C_FLAGS": "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE", - "CMAKE_CXX_FLAGS": "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE", - "CMAKE_C_FLAGS_RELEASE": "-O3 -DNDEBUG", - "CMAKE_CXX_FLAGS_RELEASE": "-O3 -DNDEBUG", - "CMAKE_C_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g", - "CMAKE_CXX_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g", - "HEXAGON_SDK_ROOT": "$env{HEXAGON_SDK_ROOT}", - "PREBUILT_LIB_DIR": "android_aarch64", - "GGML_OPENMP": "OFF", - "GGML_LLAMAFILE": "OFF", - "GGML_OPENCL": "ON", - "GGML_HEXAGON": "ON", - "GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE": "128", - "LLAMA_OPENSSL": "OFF" - } - }, - - { - "name": "arm64-windows-snapdragon", - "inherits": [ "base", "arm64-windows-llvm" ], - "cacheVariables": { - "HEXAGON_SDK_ROOT": "$env{HEXAGON_SDK_ROOT}", - "PREBUILT_LIB_DIR": "windows_aarch64", - "GGML_OPENMP": "OFF", - "GGML_LLAMAFILE": "OFF", - "GGML_OPENCL": "ON", - "GGML_HEXAGON": "ON", - "GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE": "128", - "LLAMA_OPENSSL": "OFF" - } - }, - - { "name": "arm64-android-snapdragon-debug" , "inherits": [ "base", "arm64-android-snapdragon", "debug" ] }, - { "name": "arm64-android-snapdragon-release", "inherits": [ "base", "arm64-android-snapdragon", "release" ] }, - - { "name": "arm64-windows-snapdragon-debug" , "inherits": [ "base", "arm64-windows-snapdragon", "debug" ] }, - { "name": "arm64-windows-snapdragon-release", "inherits": [ "base", "arm64-windows-snapdragon", "release" ] } - ] -} diff --git a/docs/backend/hexagon/README.md b/docs/backend/hexagon/README.md deleted file mode 100644 index 3befdf722..000000000 --- a/docs/backend/hexagon/README.md +++ /dev/null @@ -1,243 +0,0 @@ -# Snapdragon-based Android devices - -## How to Build - -The easiest way to build llama.cpp for a Snapdragon-based Android device is using the toolchain Docker image (see github.com/snapdragon-toolchain). -This image includes Android NDK, OpenCL SDK, Hexagon SDK, CMake, etc. - -This method works on Linux, macOS, and Windows. macOS and Windows users should install Docker Desktop. - -``` -~/src/llama.cpp$ docker run -it -u $(id -u):$(id -g) --volume $(pwd):/workspace --platform linux/amd64 ghcr.io/snapdragon-toolchain/arm64-android:v0.3 -[d]/> cd /workspace -``` - -The rest of the Android build process assumes that you're running inside the toolchain container. -Let's build llama.cpp with CPU, OpenCL, and Hexagon backends via CMake presets: - -``` -[d]/workspace> cp docs/backend/hexagon/CMakeUserPresets.json . - -[d]/workspace> cmake --preset arm64-android-snapdragon-release -B build-snapdragon -Preset CMake variables: - ANDROID_ABI="arm64-v8a" - ... - CMAKE_TOOLCHAIN_FILE="/opt/android-ndk-r28b/build/cmake/android.toolchain.cmake" - GGML_HEXAGON="ON" - GGML_OPENCL="ON" - GGML_OPENMP="OFF" - HEXAGON_SDK_ROOT="/opt/hexagon/6.4.0.2" -... --- Including OpenCL backend --- Including Hexagon backend -... --- Build files have been written to: /workspace/build-snapdragon - -[d]/workspace> cmake --build build-snapdragon -... -[144/356] Performing build step for 'htp-v73' -[1/16] Generating htp_iface_skel.c, htp_iface_stub.c, htp_iface.h -[2/16] Building C object CMakeFiles/ggml-htp-v73.dir/hvx-sigmoid.c.obj -[3/16] Building C object CMakeFiles/ggml-htp-v73.dir/htp-dma.c.obj -[4/16] Building C object CMakeFiles/ggml-htp-v73.dir/worker-pool.c.obj -... --- Installing: /workspace/build-snapdragon/ggml/src/ggml-hexagon/libggml-htp-v73.so --- Installing: /workspace/build-snapdragon/ggml/src/ggml-hexagon/libggml-htp-v75.so -... -``` - -To generate an installable "package" simply use cmake --install: - -``` -[d]/workspace> cmake --install build-snapdragon --prefix pkg-adb/llama.cpp --- Install configuration: "Release" --- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-cpu.so --- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-opencl.so --- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-hexagon.so --- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-htp-v73.so --- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-htp-v75.so --- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-htp-v79.so --- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-htp-v81.so --- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml.so -... --- Installing: /workspace/pkg-adb/llama.cpp/bin/llama-bench --- Installing: /workspace/pkg-adb/llama.cpp/bin/llama-cli -... -``` - -## How to Install - -For this step, your device needs to be configured for on-device development. -Please see https://developer.android.com/studio/debug/dev-options for details. - -Once ADB is enabled, use `adb push` to install `pkg-snapdragon` on the device. -**Note that the toolchain Docker image doesn't have ADB and doesn't set up the ADB bridge. Please use native ADB on the host.** - -``` -~/src/llama.cpp$ adb push pkg-adb/llama.cpp /data/local/tmp/ -pkg-adb/llama.cpp/bin/: 67 files pushed, 0 skipped. 190.2 MB/s (919095042 bytes in 4.607s) -pkg-adb/llama.cpp/include/: 19 files pushed, 0 skipped. 20.5 MB/s (255173 bytes in 0.012s) -pkg-adb/llama.cpp/lib/: 16 files pushed, 0 skipped. 144.4 MB/s (43801382 bytes in 0.289s) -102 files pushed, 0 skipped. 186.9 MB/s (963151597 bytes in 4.914s) -``` - -At this point, you should also install some models: - -``` -~/src/llama.cpp$ wget https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf -... -2025-10-11 12:04:52 (10.7 MB/s) - ‘Llama-3.2-1B-Instruct-Q4_0.gguf’ saved [773025920/773025920] - -~/src/llama.cpp$ adb push Llama-3.2-1B-Instruct-Q4_0.gguf /data/local/tmp/gguf -Llama-3.2-1B-Instruct-Q4_0.gguf: 1 file pushed, 0 skipped. 38.3 MB/s (773025920 bytes in 19.250s) -``` - -## How to Run - -The easiest way to run llama.cpp cli tools is using provided wrapper scripts that properly set up all required environment variables. - -llama.cpp supports three backends on Snapdragon-based devices: CPU, Adreno GPU (GPUOpenCL), and Hexagon NPU (HTP0-4). -You can select which backend to run the model on using the `D=` variable, which maps to the `--device` option. - -Hexagon NPU behaves as a "GPU" device when it comes to `-ngl` and other offload-related options. - -Here are some examples of running various llama.cpp tools via ADB. - -Simple question for Llama-3.2-1B - -``` -~/src/llama.cpp$ M=Llama-3.2-1B-Instruct-Q4_0.gguf D=HTP0 ./scripts/snapdragon/adb/run-completion.sh -p "what is the most popular cookie in the world?" -... -ggml-hex: Hexagon backend (experimental) : allocating new registry : ndev 1 -ggml-hex: Hexagon Arch version v79 -ggml-hex: allocating new session: HTP0 -ggml-hex: new session: HTP0 : session-id 0 domain-id 3 uri file:///libggml-htp-v79.so?htp_iface_skel_handle_invoke&_modver=1.0&_dom=cdsp&_session=0 handle 0xb4000072c7955e50 -... -load_tensors: offloading output layer to GPU -load_tensors: offloaded 17/17 layers to GPU -load_tensors: CPU model buffer size = 225.49 MiB -load_tensors: HTP0 model buffer size = 0.26 MiB -load_tensors: HTP0-REPACK model buffer size = 504.00 MiB -... -I hope this helps you understand the world's most popular cookies! [end of text] -... -llama_perf_sampler_print: sampling time = 30.08 ms / 487 runs ( 0.06 ms per token, 16191.77 tokens per second) -llama_perf_context_print: load time = 617.94 ms -llama_perf_context_print: prompt eval time = 80.76 ms / 11 tokens ( 7.34 ms per token, 136.21 tokens per second) -llama_perf_context_print: eval time = 9210.59 ms / 475 runs ( 19.39 ms per token, 51.57 tokens per second) -llama_perf_context_print: total time = 9454.92 ms / 486 tokens -llama_perf_context_print: graphs reused = 473 -llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted | -llama_memory_breakdown_print: | - HTP0 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 | -llama_memory_breakdown_print: | - Host | 439 = 225 + 136 + 77 | -llama_memory_breakdown_print: | - HTP0-REPACK | 504 = 504 + 0 + 0 | -``` - -Summary request for OLMoE-1B-7B. This is a large model that requires two HTP sessions/devices - -``` -~/src/llama.cpp$ M=OLMoE-1B-7B-0125-Instruct-Q4_0.gguf NDEV=2 D=HTP0,HTP1 ./scripts/snapdragon/adb/run-completion.sh -f surfing.txt -... -ggml-hex: Hexagon backend (experimental) : allocating new registry : ndev 1 -ggml-hex: Hexagon Arch version v81 -ggml-hex: allocating new session: HTP0 -ggml-hex: allocating new session: HTP1 -... -load_tensors: offloading output layer to GPU -load_tensors: offloaded 17/17 layers to GPU -load_tensors: CPU model buffer size = 143.86 MiB -load_tensors: HTP1 model buffer size = 0.23 MiB -load_tensors: HTP1-REPACK model buffer size = 1575.00 MiB -load_tensors: HTP0 model buffer size = 0.28 MiB -load_tensors: HTP0-REPACK model buffer size = 2025.00 MiB -... -llama_context: CPU output buffer size = 0.19 MiB -llama_kv_cache: HTP1 KV buffer size = 238.00 MiB -llama_kv_cache: HTP0 KV buffer size = 306.00 MiB -llama_kv_cache: size = 544.00 MiB ( 8192 cells, 16 layers, 1/1 seqs), K (q8_0): 272.00 MiB, V (q8_0): 272.00 MiB -llama_context: HTP0 compute buffer size = 15.00 MiB -llama_context: HTP1 compute buffer size = 15.00 MiB -llama_context: CPU compute buffer size = 24.56 MiB -... -llama_perf_context_print: prompt eval time = 1730.57 ms / 212 tokens ( 8.16 ms per token, 122.50 tokens per second) -llama_perf_context_print: eval time = 5624.75 ms / 257 runs ( 21.89 ms per token, 45.69 tokens per second) -llama_perf_context_print: total time = 7377.33 ms / 469 tokens -llama_perf_context_print: graphs reused = 255 -llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted | -llama_memory_breakdown_print: | - HTP0 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 | -llama_memory_breakdown_print: | - HTP1 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 | -llama_memory_breakdown_print: | - Host | 742 = 144 + 544 + 54 | -llama_memory_breakdown_print: | - HTP1-REPACK | 1575 = 1575 + 0 + 0 | -llama_memory_breakdown_print: | - HTP0-REPACK | 2025 = 2025 + 0 + 0 | -``` - -Op test for MUL_MAT - -``` -~/src/llama.cpp$ HB=0 ./scripts/snapdragon/adb/run-tool.sh test-backend-ops -b HTP0 -o MUL_MAT -... -Backend 2/3: HTP0 -Device description: Hexagon -Device memory: 2048 MB (2048 MB free) -MUL_MAT(type_a=q4_0,type_b=f32,m=16,n=1,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): OK -MUL_MAT(type_a=q4_0,type_b=f32,m=16,n=2,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): OK -MUL_MAT(type_a=q4_0,type_b=f32,m=16,n=3,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): OK - -~/src/llama.cpp-hexagon$ M=Llama-3.2-1B-Instruct-Q4_0.gguf ./scripts/snapdragon/adb/run-bench.sh -p 128 -n 64 -... -ggml-hex: Hexagon backend (experimental) : allocating new registry : ndev 1 -ggml-hex: Hexagon Arch version v79 -ggml-hex: allocating new session: HTP0 -ggml-hex: new session: HTP0 : session-id 0 domain-id 3 uri file:///libggml-htp-v79.so?htp_iface_skel_handle_invoke&_modver=1.0&_dom=cdsp&_session=0 handle 0xb400007d4b231090 -| model | size | params | backend | ngl | threads | n_batch | mmap | test | t/s | -| ---------------| ---------: | -----: | ---------- | --: | ------: | ------: | ---: | ----: | ------------: | -| llama 1B Q4_0 | 729.75 MiB | 1.24 B | HTP | 99 | 4 | 128 | 0 | pp128 | 169.42 ± 1.75 | -| llama 1B Q4_0 | 729.75 MiB | 1.24 B | HTP | 99 | 4 | 128 | 0 | tg64 | 51.54 ± 1.13 | - -build: 6a8cf8914 (6733) -``` - -## Environment variables - -- `GGML_HEXAGON_NDEV=1` - Controls the number of devices/sessions to allocate. The default is 1. - Most quantized models under 4B fit into a single session; an 8B model needs two, and a 20B model needs four. - -- `GGML_HEXAGON_NHVX=0` - Controls the number of HVX hardware threads to use. The default is all (actual number varies depending on the hardware version). - -- `GGML_HEXAGON_HOSTBUF=1` - Controls whether the Hexagon backend allocates host buffers. By default, all buffers except for REPACK are host buffers. - This option is required for testing Ops that require REPACK buffers (MUL_MAT and MUL_MAT_ID). - -- `GGML_HEXAGON_EXPERIMENTAL=1` - Controls whether the Hexagon backend enables experimental features. - This option is required for enabling/testing experimental Ops (FLASH_ATTN_EXT). - -- `GGML_HEXAGON_VERBOSE=1` - Enables verbose logging of Ops from the backend. Example output: - - ``` - ggml-hex: HTP0 graph-compute n_nodes 2 - ggml-hex: HTP0 matmul : blk.27.ffn_up.weight x ffn_norm-27 -> ffn_up-27 : 3072:8192 x 3072:1 -> 8192:1 : q4_0 x f32 -> f32 : HTP0 x HTP0 -> HTP0 : flags 0x1 - ggml-hex: HTP0 matmul : blk.27.ffn_gate.weight x ffn_norm-27 -> ffn_gate-27 : 3072:8192 x 3072:1 -> 8192:1 : q4_0 x f32 -> f32 : HTP0 x HTP0 -> HTP0 : flags 0x3 - ggml-hex: HTP0 graph-compute n_nodes 1 - ggml-hex: HTP0 matmul : blk.27.ffn_down.weight x ffn_gate_par-27 -> ffn_out-27 : 8192:3072 x 8192:1 -> 3072:1 : q4_0 x f32 -> f32 : HTP0 x HTP0 -> HTP0 : flags 0x0 - ggml-hex: HTP0 get-tensor result_output : data 0x7592487000 offset 0 size 513024 - ``` - -- `GGML_HEXAGON_PROFILE=1` - Generates a host-side profile for the ggml-hexagon Ops. - -- `GGML_HEXAGON_OPMASK=0x0` - Allows enabling specific stages of the processing pipeline: - - - `0x1` Enable Op Queue (i.e., queuing Ops into NPU) - - `0x2` Enable Dynamic Quantizer (if needed for the Op) - - `0x4` Enable Op Compute (MUL_MAT, etc.) - - Examples: - - `GGML_HEXAGON_OPMASK=0x1 llama-completion ...` - Ops are enqueued but NPU-side processing is stubbed out - `GGML_HEXAGON_OPMASK=0x3 llama-completion ...` - NPU performs dynamic quantization and skips the rest - `GGML_HEXAGON_OPMASK=0x7 llama-completion ...` - Full queuing and processing of Ops (default) diff --git a/docs/backend/hexagon/developer.md b/docs/backend/hexagon/developer.md deleted file mode 100644 index fc4d160e9..000000000 --- a/docs/backend/hexagon/developer.md +++ /dev/null @@ -1,109 +0,0 @@ -# Hexagon backend developer details - -## Backend libraries - -The Hexagon backend consist of two parts: - - - `libggml-hexagon` - This is the regular CPU-side GGML backend library, either shared or statically linked - - - `libggml-htp-vNN` - This is the NPU-side (HTP stands for Hexagon Tensor Processor) shared library that contains the Op dispatcher and kernels. - The correct library is selected automatically at runtime based on the HW version. - -Here is an example of the build artifacts - -``` -~/src/llama.cpp$ ls -l pkg-adb/llama.cpp/lib/libggml* -pkg-adb/llama.cpp/lib/libggml-base.so -pkg-adb/llama.cpp/lib/libggml-cpu.so -pkg-adb/llama.cpp/lib/libggml-hexagon.so <<< CPU library -pkg-adb/llama.cpp/lib/libggml-htp-v73.so <<< HTP op/kernels for Hexagon v73 -pkg-adb/llama.cpp/lib/libggml-htp-v75.so -pkg-adb/llama.cpp/lib/libggml-htp-v79.so -pkg-adb/llama.cpp/lib/libggml-htp-v81.so -``` - -## Memory buffers - -Hexagon NPU backend takes advantage of the Snapdragon's unified memory model where all buffers are fully accessible by the CPU and GPU. -The NPU does have a dedicated tightly-coupled memory called VTCM but that memory is used only for intermediate data (e.g. dynamically -quantized tensors) or temporary data (chunks of the weight tensors fetched via DMA). - -Please note that currently the Hexagon backend does not implement SET/GET_ROWS Ops because there is no advantage in offloading those -to the NPU at this point. - -The backend does allocates non-host buffers for the tensors with datatypes that require repacking: Q4_0, Q8_0, MXFP4. -From the MMU perspective these buffers are still regular buffers (normal access by the CPU) they are marked as non-host simply to force -the repacking. - -## Large model handling - -Hexagon NPU session (aka Process Domain (PD) in the Hexagon docs) is limited to a memory mapping of around 3.5GB. -In llama.cpp/GGML the Hexagon session is mapped to a single GGML backend device (HTP0, HTP1, etc). - -In order to map models larger than 3.5GB we need to allocate multiple devices and split the model. -For this we're taking advantage of the llama.cpp/GGML multi-GPU layer-splitting support. -Each Hexagon device behaves like a GPU from the offload and model splitting perspective. - -Here is an example of running GPT-OSS-20B model on a newer Snapdragon device with 16GB of DDR. - -``` -M=gpt-oss-20b-Q4_0.gguf NDEV=4 D=HTP0,HTP1,HTP2,HTP3 P=surfing.txt scripts/snapdragon/adb/run-completion.sh -f surfing.txt -n 32 -... -LD_LIBRARY_PATH=/data/local/tmp/llama.cpp/lib -ADSP_LIBRARY_PATH=/data/local/tmp/llama.cpp/lib -GGML_HEXAGON_NDEV=4 ./bin/llama-cli --no-mmap -m /data/local/tmp/llama.cpp/../gguf/gpt-oss-20b-Q4_0.gguf - -t 4 --ctx-size 8192 --batch-size 128 -ctk q8_0 -ctv q8_0 -fa on -ngl 99 --device HTP0,HTP1,HTP2,HTP3 -no-cnv -f surfing.txt -... -llama_model_loader: - type f32: 289 tensors -llama_model_loader: - type q4_0: 96 tensors -llama_model_loader: - type q8_0: 2 tensors -llama_model_loader: - type mxfp4: 72 tensors -... -load_tensors: offloaded 25/25 layers to GPU -load_tensors: CPU model buffer size = 1182.09 MiB -load_tensors: HTP1 model buffer size = 6.64 MiB -load_tensors: HTP1-REPACK model buffer size = 2505.94 MiB -load_tensors: HTP3 model buffer size = 5.55 MiB -load_tensors: HTP3-REPACK model buffer size = 2088.28 MiB -load_tensors: HTP0 model buffer size = 7.75 MiB -load_tensors: HTP0-REPACK model buffer size = 2923.59 MiB -load_tensors: HTP2 model buffer size = 6.64 MiB -load_tensors: HTP2-REPACK model buffer size = 2505.94 MiB -... -llama_context: n_ctx_per_seq (8192) < n_ctx_train (131072) -- the full capacity of the model will not be utilized -llama_context: CPU output buffer size = 0.77 MiB -llama_kv_cache_iswa: creating non-SWA KV cache, size = 8192 cells -llama_kv_cache: HTP1 KV buffer size = 25.50 MiB -llama_kv_cache: HTP3 KV buffer size = 25.50 MiB -llama_kv_cache: HTP0 KV buffer size = 25.50 MiB -llama_kv_cache: HTP2 KV buffer size = 25.50 MiB -llama_kv_cache: size = 102.00 MiB ( 8192 cells, 12 layers, 1/1 seqs), K (q8_0): 51.00 MiB, V (q8_0): 51.00 MiB -llama_kv_cache_iswa: creating SWA KV cache, size = 256 cells -llama_kv_cache: HTP1 KV buffer size = 0.80 MiB -llama_kv_cache: HTP3 KV buffer size = 0.53 MiB -llama_kv_cache: HTP0 KV buffer size = 1.06 MiB -llama_kv_cache: HTP2 KV buffer size = 0.80 MiB -llama_kv_cache: size = 3.19 MiB ( 256 cells, 12 layers, 1/1 seqs), K (q8_0): 1.59 MiB, V (q8_0): 1.59 MiB -llama_context: HTP0 compute buffer size = 16.06 MiB -llama_context: HTP1 compute buffer size = 16.06 MiB -llama_context: HTP2 compute buffer size = 16.06 MiB -llama_context: HTP3 compute buffer size = 16.06 MiB -llama_context: CPU compute buffer size = 98.19 MiB -... -llama_perf_context_print: prompt eval time = 3843.67 ms / 197 tokens ( 19.51 ms per token, 51.25 tokens per second) -llama_perf_context_print: eval time = 1686.13 ms / 31 runs ( 54.39 ms per token, 18.39 tokens per second) -llama_perf_context_print: total time = 6266.30 ms / 228 tokens -llama_perf_context_print: graphs reused = 30 -llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted | -llama_memory_breakdown_print: | - HTP0 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 | -llama_memory_breakdown_print: | - HTP1 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 | -llama_memory_breakdown_print: | - HTP2 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 | -llama_memory_breakdown_print: | - HTP3 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 | -llama_memory_breakdown_print: | - Host | 1476 = 1208 + 105 + 162 | -llama_memory_breakdown_print: | - HTP1-REPACK | 2505 = 2505 + 0 + 0 | -llama_memory_breakdown_print: | - HTP3-REPACK | 2088 = 2088 + 0 + 0 | -llama_memory_breakdown_print: | - HTP0-REPACK | 2923 = 2923 + 0 + 0 | -llama_memory_breakdown_print: | - HTP2-REPACK | 2505 = 2505 + 0 + 0 | -``` diff --git a/docs/backend/snapdragon/CMakeUserPresets.json b/docs/backend/snapdragon/CMakeUserPresets.json new file mode 100644 index 000000000..4cf473d05 --- /dev/null +++ b/docs/backend/snapdragon/CMakeUserPresets.json @@ -0,0 +1,66 @@ +{ + "version": 5, + "cmakeMinimumRequired": { + "major": 3, + "minor": 28, + "patch": 0 + }, + "configurePresets": [ + { + "name": "arm64-android-snapdragon", + "hidden": true, + "architecture": { "value": "arm64", "strategy": "external" }, + "toolset": { "value": "host=x86_64", "strategy": "external" }, + "cacheVariables": { + "ANDROID_ABI": "arm64-v8a", + "ANDROID_PLATFORM": "android-31", + "CMAKE_TOOLCHAIN_FILE": "$env{ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake", + "CMAKE_C_FLAGS": "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE", + "CMAKE_CXX_FLAGS": "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE", + "CMAKE_C_FLAGS_RELEASE": "-O3 -DNDEBUG", + "CMAKE_CXX_FLAGS_RELEASE": "-O3 -DNDEBUG", + "CMAKE_C_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g", + "CMAKE_CXX_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g", + "CMAKE_PREFIX_PATH": "$env{OPENCL_SDK_ROOT}", + "HEXAGON_SDK_ROOT": "$env{HEXAGON_SDK_ROOT}", + "HEXAGON_TOOLS_ROOT": "$env{HEXAGON_TOOLS_ROOT}", + "PREBUILT_LIB_DIR": "android_aarch64", + "GGML_OPENMP": "OFF", + "GGML_LLAMAFILE": "OFF", + "GGML_OPENCL": "ON", + "GGML_HEXAGON": "ON", + "GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE": "128", + "LLAMA_OPENSSL": "OFF" + } + }, + + { + "name": "arm64-windows-snapdragon", + "inherits": [ "base", "arm64-windows-llvm" ], + "cacheVariables": { + "CMAKE_C_FLAGS": "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -flto -D_GNU_SOURCE", + "CMAKE_CXX_FLAGS": "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -flto -D_GNU_SOURCE", + "CMAKE_C_FLAGS_RELEASE": "-O3 -DNDEBUG", + "CMAKE_CXX_FLAGS_RELEASE": "-O3 -DNDEBUG", + "CMAKE_C_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g", + "CMAKE_CXX_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g", + "CMAKE_PREFIX_PATH": "$env{OPENCL_SDK_ROOT}", + "HEXAGON_SDK_ROOT": "$env{HEXAGON_SDK_ROOT}", + "HEXAGON_TOOLS_ROOT": "$env{HEXAGON_TOOLS_ROOT}", + "PREBUILT_LIB_DIR": "windows_aarch64", + "GGML_OPENMP": "OFF", + "GGML_LLAMAFILE": "OFF", + "GGML_OPENCL": "ON", + "GGML_HEXAGON": "ON", + "GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE": "128", + "LLAMA_OPENSSL": "OFF" + } + }, + + { "name": "arm64-android-snapdragon-debug" , "inherits": [ "base", "arm64-android-snapdragon", "debug" ] }, + { "name": "arm64-android-snapdragon-release", "inherits": [ "base", "arm64-android-snapdragon", "release" ] }, + + { "name": "arm64-windows-snapdragon-debug" , "inherits": [ "base", "arm64-windows-snapdragon", "debug" ] }, + { "name": "arm64-windows-snapdragon-release", "inherits": [ "base", "arm64-windows-snapdragon", "release" ] } + ] +} diff --git a/docs/backend/snapdragon/README.md b/docs/backend/snapdragon/README.md new file mode 100644 index 000000000..8e1f37b20 --- /dev/null +++ b/docs/backend/snapdragon/README.md @@ -0,0 +1,269 @@ +# Snapdragon-based devices + +## Setup + +### Android + +The easiest way to build llama.cpp for a Snapdragon-based Android device is using the toolchain Docker image (see github.com/snapdragon-toolchain). +This image includes Android NDK, OpenCL SDK, Hexagon SDK, CMake, etc. + +This method works on Linux, macOS, and Windows. macOS and Windows users should install Docker Desktop. + +``` +~/src/llama.cpp$ docker run -it -u $(id -u):$(id -g) --volume $(pwd):/workspace --platform linux/amd64 ghcr.io/snapdragon-toolchain/arm64-android:v0.3 +[d]/> cd /workspace +``` + +Note: The rest of the **Android** build process assumes that you're running inside the toolchain container. + +### Windows On Snapdragon + +Native Windows 11 arm64 builds has the following tools dependencies: +- MS Visual Studio 2026 (Community Edition or Pro) + - MSVC arm64 standard and runtime libraries + - UCRT and Driver Kit +- LLVM core libraries and Clang compiler (winget) +- CMake, Git, Python (winget) +- Hexagon SDK Community Edition 6.4 or later (see windows.md) +- OpenCL SDK 2.3 or later (see windows.md) + +Note: The rest of the **Windows** build process assumes that you're running natively in Powershell. +Adapt below build commands accordingly. + +## How to Build + +Let's build llama.cpp with CPU, OpenCL, and Hexagon backends via CMake presets: + +``` +[d]/workspace> cp docs/backend/hexagon/CMakeUserPresets.json . + +[d]/workspace> cmake --preset arm64-android-snapdragon-release -B build-snapdragon +Preset CMake variables: + ANDROID_ABI="arm64-v8a" + ... + CMAKE_TOOLCHAIN_FILE="/opt/android-ndk-r28b/build/cmake/android.toolchain.cmake" + GGML_HEXAGON="ON" + GGML_OPENCL="ON" + GGML_OPENMP="OFF" + HEXAGON_SDK_ROOT="/opt/hexagon/6.4.0.2" +... +-- Including OpenCL backend +-- Including Hexagon backend +... +-- Build files have been written to: /workspace/build-snapdragon + +[d]/workspace> cmake --build build-snapdragon +... +[144/356] Performing build step for 'htp-v73' +[1/16] Generating htp_iface_skel.c, htp_iface_stub.c, htp_iface.h +[2/16] Building C object CMakeFiles/ggml-htp-v73.dir/hvx-sigmoid.c.obj +[3/16] Building C object CMakeFiles/ggml-htp-v73.dir/htp-dma.c.obj +[4/16] Building C object CMakeFiles/ggml-htp-v73.dir/worker-pool.c.obj +... +-- Installing: /workspace/build-snapdragon/ggml/src/ggml-hexagon/libggml-htp-v73.so +-- Installing: /workspace/build-snapdragon/ggml/src/ggml-hexagon/libggml-htp-v75.so +... +``` + +To generate an installable "package" simply use cmake --install: + +``` +[d]/workspace> cmake --install build-snapdragon --prefix pkg-snapdragon/llama.cpp +-- Install configuration: "Release" +-- Installing: /workspace/pkg-snapdragon/llama.cpp/lib/libggml-cpu.so +-- Installing: /workspace/pkg-snapdragon/llama.cpp/lib/libggml-opencl.so +-- Installing: /workspace/pkg-snapdragon/llama.cpp/lib/libggml-hexagon.so +-- Installing: /workspace/pkg-snapdragon/llama.cpp/lib/libggml-htp-v73.so +-- Installing: /workspace/pkg-snapdragon/llama.cpp/lib/libggml-htp-v75.so +-- Installing: /workspace/pkg-snapdragon/llama.cpp/lib/libggml-htp-v79.so +-- Installing: /workspace/pkg-snapdragon/llama.cpp/lib/libggml-htp-v81.so +-- Installing: /workspace/pkg-snapdragon/llama.cpp/lib/libggml.so +... +-- Installing: /workspace/pkg-snapdragon/llama.cpp/bin/llama-bench +-- Installing: /workspace/pkg-snapdragon/llama.cpp/bin/llama-cli +... +``` + +## How to Install + +### Android + +For this step, your device needs to be configured for on-device development. +Please see https://developer.android.com/studio/debug/dev-options for details. + +Once ADB is enabled, use `adb push` to install `pkg-snapdragon` on the device. +**Note that the toolchain Docker image doesn't have ADB and doesn't set up the ADB bridge. Please use native ADB on the host.** + +``` +~/src/llama.cpp$ adb push pkg-snapdragon/llama.cpp /data/local/tmp/ +pkg-snapdragon/llama.cpp/bin/: 67 files pushed, 0 skipped. 190.2 MB/s (919095042 bytes in 4.607s) +pkg-snapdragon/llama.cpp/include/: 19 files pushed, 0 skipped. 20.5 MB/s (255173 bytes in 0.012s) +pkg-snapdragon/llama.cpp/lib/: 16 files pushed, 0 skipped. 144.4 MB/s (43801382 bytes in 0.289s) +102 files pushed, 0 skipped. 186.9 MB/s (963151597 bytes in 4.914s) +``` + +At this point, you should also install some models: + +``` +~/src/llama.cpp$ wget https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf +... +2025-10-11 12:04:52 (10.7 MB/s) - ‘Llama-3.2-1B-Instruct-Q4_0.gguf’ saved [773025920/773025920] + +~/src/llama.cpp$ adb push Llama-3.2-1B-Instruct-Q4_0.gguf /data/local/tmp/gguf +Llama-3.2-1B-Instruct-Q4_0.gguf: 1 file pushed, 0 skipped. 38.3 MB/s (773025920 bytes in 19.250s) +``` + +### Windows + +All artifacts are already installed in the `pkg-snapdragon` folder. +To run, adapt below instructions to use Powershell scrits in `scripts/snapdragon/windows`. + +## How to Run + +The easiest way to run llama.cpp cli tools is using provided wrapper scripts that properly set up all required environment variables. + +llama.cpp supports three backends on Snapdragon-based devices: CPU, Adreno GPU (GPUOpenCL), and Hexagon NPU (HTP0-4). +You can select which backend to run the model on using the `D=` variable, which maps to the `--device` option. + +Hexagon NPU behaves as a "GPU" device when it comes to `-ngl` and other offload-related options. + +Here are some examples of running various llama.cpp tools via ADB. + +Simple question for Llama-3.2-1B + +``` +~/src/llama.cpp$ M=Llama-3.2-1B-Instruct-Q4_0.gguf D=HTP0 ./scripts/snapdragon/adb/run-completion.sh -p "what is the most popular cookie in the world?" +... +ggml-hex: Hexagon backend (experimental) : allocating new registry : ndev 1 +ggml-hex: Hexagon Arch version v79 +ggml-hex: allocating new session: HTP0 +ggml-hex: new session: HTP0 : session-id 0 domain-id 3 uri file:///libggml-htp-v79.so?htp_iface_skel_handle_invoke&_modver=1.0&_dom=cdsp&_session=0 handle 0xb4000072c7955e50 +... +load_tensors: offloading output layer to GPU +load_tensors: offloaded 17/17 layers to GPU +load_tensors: CPU model buffer size = 225.49 MiB +load_tensors: HTP0 model buffer size = 0.26 MiB +load_tensors: HTP0-REPACK model buffer size = 504.00 MiB +... +I hope this helps you understand the world's most popular cookies! [end of text] +... +llama_perf_sampler_print: sampling time = 30.08 ms / 487 runs ( 0.06 ms per token, 16191.77 tokens per second) +llama_perf_context_print: load time = 617.94 ms +llama_perf_context_print: prompt eval time = 80.76 ms / 11 tokens ( 7.34 ms per token, 136.21 tokens per second) +llama_perf_context_print: eval time = 9210.59 ms / 475 runs ( 19.39 ms per token, 51.57 tokens per second) +llama_perf_context_print: total time = 9454.92 ms / 486 tokens +llama_perf_context_print: graphs reused = 473 +llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted | +llama_memory_breakdown_print: | - HTP0 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 | +llama_memory_breakdown_print: | - Host | 439 = 225 + 136 + 77 | +llama_memory_breakdown_print: | - HTP0-REPACK | 504 = 504 + 0 + 0 | +``` + +Summary request for OLMoE-1B-7B. This is a large model that requires two HTP sessions/devices + +``` +~/src/llama.cpp$ M=OLMoE-1B-7B-0125-Instruct-Q4_0.gguf NDEV=2 D=HTP0,HTP1 ./scripts/snapdragon/adb/run-completion.sh -f surfing.txt +... +ggml-hex: Hexagon backend (experimental) : allocating new registry : ndev 1 +ggml-hex: Hexagon Arch version v81 +ggml-hex: allocating new session: HTP0 +ggml-hex: allocating new session: HTP1 +... +load_tensors: offloading output layer to GPU +load_tensors: offloaded 17/17 layers to GPU +load_tensors: CPU model buffer size = 143.86 MiB +load_tensors: HTP1 model buffer size = 0.23 MiB +load_tensors: HTP1-REPACK model buffer size = 1575.00 MiB +load_tensors: HTP0 model buffer size = 0.28 MiB +load_tensors: HTP0-REPACK model buffer size = 2025.00 MiB +... +llama_context: CPU output buffer size = 0.19 MiB +llama_kv_cache: HTP1 KV buffer size = 238.00 MiB +llama_kv_cache: HTP0 KV buffer size = 306.00 MiB +llama_kv_cache: size = 544.00 MiB ( 8192 cells, 16 layers, 1/1 seqs), K (q8_0): 272.00 MiB, V (q8_0): 272.00 MiB +llama_context: HTP0 compute buffer size = 15.00 MiB +llama_context: HTP1 compute buffer size = 15.00 MiB +llama_context: CPU compute buffer size = 24.56 MiB +... +llama_perf_context_print: prompt eval time = 1730.57 ms / 212 tokens ( 8.16 ms per token, 122.50 tokens per second) +llama_perf_context_print: eval time = 5624.75 ms / 257 runs ( 21.89 ms per token, 45.69 tokens per second) +llama_perf_context_print: total time = 7377.33 ms / 469 tokens +llama_perf_context_print: graphs reused = 255 +llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted | +llama_memory_breakdown_print: | - HTP0 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 | +llama_memory_breakdown_print: | - HTP1 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 | +llama_memory_breakdown_print: | - Host | 742 = 144 + 544 + 54 | +llama_memory_breakdown_print: | - HTP1-REPACK | 1575 = 1575 + 0 + 0 | +llama_memory_breakdown_print: | - HTP0-REPACK | 2025 = 2025 + 0 + 0 | +``` + +Op test for MUL_MAT + +``` +~/src/llama.cpp$ HB=0 ./scripts/snapdragon/adb/run-tool.sh test-backend-ops -b HTP0 -o MUL_MAT +... +Backend 2/3: HTP0 +Device description: Hexagon +Device memory: 2048 MB (2048 MB free) +MUL_MAT(type_a=q4_0,type_b=f32,m=16,n=1,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): OK +MUL_MAT(type_a=q4_0,type_b=f32,m=16,n=2,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): OK +MUL_MAT(type_a=q4_0,type_b=f32,m=16,n=3,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): OK + +~/src/llama.cpp-hexagon$ M=Llama-3.2-1B-Instruct-Q4_0.gguf ./scripts/snapdragon/adb/run-bench.sh -p 128 -n 64 +... +ggml-hex: Hexagon backend (experimental) : allocating new registry : ndev 1 +ggml-hex: Hexagon Arch version v79 +ggml-hex: allocating new session: HTP0 +ggml-hex: new session: HTP0 : session-id 0 domain-id 3 uri file:///libggml-htp-v79.so?htp_iface_skel_handle_invoke&_modver=1.0&_dom=cdsp&_session=0 handle 0xb400007d4b231090 +| model | size | params | backend | ngl | threads | n_batch | mmap | test | t/s | +| ---------------| ---------: | -----: | ---------- | --: | ------: | ------: | ---: | ----: | ------------: | +| llama 1B Q4_0 | 729.75 MiB | 1.24 B | HTP | 99 | 4 | 128 | 0 | pp128 | 169.42 ± 1.75 | +| llama 1B Q4_0 | 729.75 MiB | 1.24 B | HTP | 99 | 4 | 128 | 0 | tg64 | 51.54 ± 1.13 | + +build: 6a8cf8914 (6733) +``` + +## Environment variables + +- `GGML_HEXAGON_NDEV=1` + Controls the number of devices/sessions to allocate. The default is 1. + Most quantized models under 4B fit into a single session; an 8B model needs two, and a 20B model needs four. + +- `GGML_HEXAGON_NHVX=0` + Controls the number of HVX hardware threads to use. The default is all (actual number varies depending on the hardware version). + +- `GGML_HEXAGON_HOSTBUF=1` + Controls whether the Hexagon backend allocates host buffers. By default, all buffers except for REPACK are host buffers. + This option is required for testing Ops that require REPACK buffers (MUL_MAT and MUL_MAT_ID). + +- `GGML_HEXAGON_EXPERIMENTAL=1` + Controls whether the Hexagon backend enables experimental features. + This option is required for enabling/testing experimental Ops (FLASH_ATTN_EXT). + +- `GGML_HEXAGON_VERBOSE=1` + Enables verbose logging of Ops from the backend. Example output: + + ``` + ggml-hex: HTP0 graph-compute n_nodes 2 + ggml-hex: HTP0 matmul : blk.27.ffn_up.weight x ffn_norm-27 -> ffn_up-27 : 3072:8192 x 3072:1 -> 8192:1 : q4_0 x f32 -> f32 : HTP0 x HTP0 -> HTP0 : flags 0x1 + ggml-hex: HTP0 matmul : blk.27.ffn_gate.weight x ffn_norm-27 -> ffn_gate-27 : 3072:8192 x 3072:1 -> 8192:1 : q4_0 x f32 -> f32 : HTP0 x HTP0 -> HTP0 : flags 0x3 + ggml-hex: HTP0 graph-compute n_nodes 1 + ggml-hex: HTP0 matmul : blk.27.ffn_down.weight x ffn_gate_par-27 -> ffn_out-27 : 8192:3072 x 8192:1 -> 3072:1 : q4_0 x f32 -> f32 : HTP0 x HTP0 -> HTP0 : flags 0x0 + ggml-hex: HTP0 get-tensor result_output : data 0x7592487000 offset 0 size 513024 + ``` + +- `GGML_HEXAGON_PROFILE=1` + Generates a host-side profile for the ggml-hexagon Ops. + +- `GGML_HEXAGON_OPMASK=0x0` + Allows enabling specific stages of the processing pipeline: + + - `0x1` Enable Op Queue (i.e., queuing Ops into NPU) + - `0x2` Enable Dynamic Quantizer (if needed for the Op) + - `0x4` Enable Op Compute (MUL_MAT, etc.) + + Examples: + + `GGML_HEXAGON_OPMASK=0x1 llama-completion ...` - Ops are enqueued but NPU-side processing is stubbed out + `GGML_HEXAGON_OPMASK=0x3 llama-completion ...` - NPU performs dynamic quantization and skips the rest + `GGML_HEXAGON_OPMASK=0x7 llama-completion ...` - Full queuing and processing of Ops (default) diff --git a/docs/backend/snapdragon/developer.md b/docs/backend/snapdragon/developer.md new file mode 100644 index 000000000..fc4d160e9 --- /dev/null +++ b/docs/backend/snapdragon/developer.md @@ -0,0 +1,109 @@ +# Hexagon backend developer details + +## Backend libraries + +The Hexagon backend consist of two parts: + + - `libggml-hexagon` + This is the regular CPU-side GGML backend library, either shared or statically linked + + - `libggml-htp-vNN` + This is the NPU-side (HTP stands for Hexagon Tensor Processor) shared library that contains the Op dispatcher and kernels. + The correct library is selected automatically at runtime based on the HW version. + +Here is an example of the build artifacts + +``` +~/src/llama.cpp$ ls -l pkg-adb/llama.cpp/lib/libggml* +pkg-adb/llama.cpp/lib/libggml-base.so +pkg-adb/llama.cpp/lib/libggml-cpu.so +pkg-adb/llama.cpp/lib/libggml-hexagon.so <<< CPU library +pkg-adb/llama.cpp/lib/libggml-htp-v73.so <<< HTP op/kernels for Hexagon v73 +pkg-adb/llama.cpp/lib/libggml-htp-v75.so +pkg-adb/llama.cpp/lib/libggml-htp-v79.so +pkg-adb/llama.cpp/lib/libggml-htp-v81.so +``` + +## Memory buffers + +Hexagon NPU backend takes advantage of the Snapdragon's unified memory model where all buffers are fully accessible by the CPU and GPU. +The NPU does have a dedicated tightly-coupled memory called VTCM but that memory is used only for intermediate data (e.g. dynamically +quantized tensors) or temporary data (chunks of the weight tensors fetched via DMA). + +Please note that currently the Hexagon backend does not implement SET/GET_ROWS Ops because there is no advantage in offloading those +to the NPU at this point. + +The backend does allocates non-host buffers for the tensors with datatypes that require repacking: Q4_0, Q8_0, MXFP4. +From the MMU perspective these buffers are still regular buffers (normal access by the CPU) they are marked as non-host simply to force +the repacking. + +## Large model handling + +Hexagon NPU session (aka Process Domain (PD) in the Hexagon docs) is limited to a memory mapping of around 3.5GB. +In llama.cpp/GGML the Hexagon session is mapped to a single GGML backend device (HTP0, HTP1, etc). + +In order to map models larger than 3.5GB we need to allocate multiple devices and split the model. +For this we're taking advantage of the llama.cpp/GGML multi-GPU layer-splitting support. +Each Hexagon device behaves like a GPU from the offload and model splitting perspective. + +Here is an example of running GPT-OSS-20B model on a newer Snapdragon device with 16GB of DDR. + +``` +M=gpt-oss-20b-Q4_0.gguf NDEV=4 D=HTP0,HTP1,HTP2,HTP3 P=surfing.txt scripts/snapdragon/adb/run-completion.sh -f surfing.txt -n 32 +... +LD_LIBRARY_PATH=/data/local/tmp/llama.cpp/lib +ADSP_LIBRARY_PATH=/data/local/tmp/llama.cpp/lib +GGML_HEXAGON_NDEV=4 ./bin/llama-cli --no-mmap -m /data/local/tmp/llama.cpp/../gguf/gpt-oss-20b-Q4_0.gguf + -t 4 --ctx-size 8192 --batch-size 128 -ctk q8_0 -ctv q8_0 -fa on -ngl 99 --device HTP0,HTP1,HTP2,HTP3 -no-cnv -f surfing.txt +... +llama_model_loader: - type f32: 289 tensors +llama_model_loader: - type q4_0: 96 tensors +llama_model_loader: - type q8_0: 2 tensors +llama_model_loader: - type mxfp4: 72 tensors +... +load_tensors: offloaded 25/25 layers to GPU +load_tensors: CPU model buffer size = 1182.09 MiB +load_tensors: HTP1 model buffer size = 6.64 MiB +load_tensors: HTP1-REPACK model buffer size = 2505.94 MiB +load_tensors: HTP3 model buffer size = 5.55 MiB +load_tensors: HTP3-REPACK model buffer size = 2088.28 MiB +load_tensors: HTP0 model buffer size = 7.75 MiB +load_tensors: HTP0-REPACK model buffer size = 2923.59 MiB +load_tensors: HTP2 model buffer size = 6.64 MiB +load_tensors: HTP2-REPACK model buffer size = 2505.94 MiB +... +llama_context: n_ctx_per_seq (8192) < n_ctx_train (131072) -- the full capacity of the model will not be utilized +llama_context: CPU output buffer size = 0.77 MiB +llama_kv_cache_iswa: creating non-SWA KV cache, size = 8192 cells +llama_kv_cache: HTP1 KV buffer size = 25.50 MiB +llama_kv_cache: HTP3 KV buffer size = 25.50 MiB +llama_kv_cache: HTP0 KV buffer size = 25.50 MiB +llama_kv_cache: HTP2 KV buffer size = 25.50 MiB +llama_kv_cache: size = 102.00 MiB ( 8192 cells, 12 layers, 1/1 seqs), K (q8_0): 51.00 MiB, V (q8_0): 51.00 MiB +llama_kv_cache_iswa: creating SWA KV cache, size = 256 cells +llama_kv_cache: HTP1 KV buffer size = 0.80 MiB +llama_kv_cache: HTP3 KV buffer size = 0.53 MiB +llama_kv_cache: HTP0 KV buffer size = 1.06 MiB +llama_kv_cache: HTP2 KV buffer size = 0.80 MiB +llama_kv_cache: size = 3.19 MiB ( 256 cells, 12 layers, 1/1 seqs), K (q8_0): 1.59 MiB, V (q8_0): 1.59 MiB +llama_context: HTP0 compute buffer size = 16.06 MiB +llama_context: HTP1 compute buffer size = 16.06 MiB +llama_context: HTP2 compute buffer size = 16.06 MiB +llama_context: HTP3 compute buffer size = 16.06 MiB +llama_context: CPU compute buffer size = 98.19 MiB +... +llama_perf_context_print: prompt eval time = 3843.67 ms / 197 tokens ( 19.51 ms per token, 51.25 tokens per second) +llama_perf_context_print: eval time = 1686.13 ms / 31 runs ( 54.39 ms per token, 18.39 tokens per second) +llama_perf_context_print: total time = 6266.30 ms / 228 tokens +llama_perf_context_print: graphs reused = 30 +llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted | +llama_memory_breakdown_print: | - HTP0 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 | +llama_memory_breakdown_print: | - HTP1 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 | +llama_memory_breakdown_print: | - HTP2 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 | +llama_memory_breakdown_print: | - HTP3 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 | +llama_memory_breakdown_print: | - Host | 1476 = 1208 + 105 + 162 | +llama_memory_breakdown_print: | - HTP1-REPACK | 2505 = 2505 + 0 + 0 | +llama_memory_breakdown_print: | - HTP3-REPACK | 2088 = 2088 + 0 + 0 | +llama_memory_breakdown_print: | - HTP0-REPACK | 2923 = 2923 + 0 + 0 | +llama_memory_breakdown_print: | - HTP2-REPACK | 2505 = 2505 + 0 + 0 | +``` diff --git a/docs/backend/snapdragon/windows.md b/docs/backend/snapdragon/windows.md new file mode 100644 index 000000000..710ad8fdf --- /dev/null +++ b/docs/backend/snapdragon/windows.md @@ -0,0 +1,161 @@ +## Overview + +The document covers procedures for installing the latest GPU and NPU drivers, and OpenCL and Hexagon SDKs. + + +In order to use Hexagon NPU on Snapdragon Windows devices the underlying HTP Ops libraries (e.g libggml-htp-v73.so) +must be included in the .cat file digitally signed with a trusted certificate. + +This document covers details on how to generate personal certificate files (.pfx) and how to configure the system +to allow for test signatures (aka test-signing). + +## Install the latest Adreno OpenCL SDK + +Either use the trimmed down version (optimized for CI) from + + https://github.com/snapdragon-toolchain/opencl-sdk/releases/download/v2.3.2/adreno-opencl-sdk-v2.3.2-arm64-wos.tar.xz + +Or download the complete official version from + + https://softwarecenter.qualcomm.com/catalog/item/Adreno_OpenCL_SDK?version=2.3.2 + +Unzip/untar the archive into +``` +c:\Qualcomm\OpenCL_SDK\2.3.2 +``` + +## Install the latest Hexagon SDK Community Edition + +Either use the trimmed down version (optimized for CI) from + + https://github.com/snapdragon-toolchain/hexagon-sdk/releases/download/v6.4.0.2/hexagon-sdk-v6.4.0.2-arm64-wos.tar.xz + +Or download the complete official version from + + https://softwarecenter.qualcomm.com/catalog/item/Hexagon_SDK?version=6.4.0.2 + +Unzip/untar the archive into +``` +c:\Qualcomm\Hexagon_SDK\6.4.0.2 +``` + +## Install the latest Adreno GPU driver + +Download the driver from + + https://softwarecenter.qualcomm.com/catalog/item/Windows_Graphics_Driver + +After the automated installation and reboot please make sure that the GPU device shows up in the `Device Manager` (under 'Display Adapters`) + +## Install the latest Qualcomm NPU driver + +Download the driver from + + https://softwarecenter.qualcomm.com/catalog/item/Qualcomm_HND + +After the automated installation and reboot please make sure that the Hexagon NPU device shows up in the `Device Manager` (under `Neural Processors`). + +If the device is not available you can try installing all components (`qcnspmcdm8380`, `qcnspmcdm8380_ext`) manually. +The components are extracted into +``` +c:\QCDrivers\qcnspmcdm... +``` + +## Enable NPU driver test signatures + +Please note that the following steps are required only for the Hexagon NPU. +Adreno GPU backend does not require test signatures. + +### Enable testsigning + +Use `bcdedit` to enable test-signing +``` +> bcdedit /set TESTSIGNING ON +``` +(Secure Boot may need to be disabled for this to work) + +Make sure test-signing is enabled after reboot +``` +> bcdedit /enum +... +testsigning Yes +... +``` +For additional details see Microsoft guide at + + https://learn.microsoft.com/en-us/windows-hardware/drivers/install/the-testsigning-boot-configuration-option + +### Create personal certificate + +The tools required for this procedure are available as part of Windows SDK and Windows Driver Kit which should be +installed as part of the MS Visual Studio. +They are typically located at +``` +c:\Program Files (x86)\Windows Kits\10\bin\10.0.26100.0 +``` +(replace 10.0.26100.0 with correct version). + +To create personal self-signed certificate run the following commands (either from cmd or power-shell): +``` +> cd c:\Users\MyUser +> mkdir Certs +> cd Certs +> makecert -r -pe -ss PrivateCertStore -n CN=GGML.HTP.v1 -eku 1.3.6.1.5.5.7.3.3 -sv ggml-htp-v1.pvk ggml-htp-v1.cer +> pvk2pfx.exe -pvk ggml-htp-v1.pvk -spc ggml-htp-v1.cer -pfx ggml-htp-v1.pfx +``` +(replace `MyUser` with your username). + +Add this certificate to `Trusted Root Certification Authorities` and `Trusted Publishers` stores. +This can be done using `certlm` Certificate Manager tool. +Right click on the certificate store, select `All Tasks -> Import` and follow the prompts to import the certificate from the +PFX file you created above. + +For additional details see Microsoft guide at + + https://learn.microsoft.com/en-us/windows-hardware/drivers/install/introduction-to-test-signing + +Make sure to save the PFX file, you will need it for the build procedures. +Please note that the same certificate can be used for signing any number of builds. + +## Build Hexagon backend with signed HTP ops libraries + +The overall Hexagon backend build procedure for Windows on Snapdragon is the same as for other platforms. +However, additional settings are required for generating and signing HTP Ops libraries. +``` +> $env:OPENCL_SDK_ROOT="C:\Qualcomm\OpenCL_SDK\2.3.2" +> $env:HEXAGON_SDK_ROOT="C:\Qualcomm\Hexagon_SDK\6.4.0.2" +> $env:HEXAGON_TOOLS_ROOT="C:\Qualcomm\Hexagon_SDK\6.4.0.2\tools\HEXAGON_Tools\19.0.04" +> $env:HEXAGON_HTP_CERT="c:\Users\MyUsers\Certs\ggml-htp-v1.pfx" +> $env:WINDOWS_SDK_BIN="C:\Program Files (x86)\Windows Kits\10\bin\10.0.26100.0\arm64" + +> cmake --preset arm64-windows-snapdragon -B build-wos +... +> cmake --install build-wos --prefix pkg-snapdragon +``` + +Once the build is complete HTP ops libraries will be installed like this +``` +> dir pkg-snapdragon/lib +... +-a---- 1/22/2026 6:01 PM 187656 libggml-htp-v73.so +-a---- 1/22/2026 6:01 PM 191752 libggml-htp-v75.so +-a---- 1/22/2026 6:01 PM 187656 libggml-htp-v79.so +-a---- 1/22/2026 6:01 PM 187656 libggml-htp-v81.so +-a---- 1/22/2026 6:01 PM 4139 libggml-htp.cat +``` + +The .cat file, the signature and proper certicate installation can be verified with + +``` +> signtool.exe verify /v /pa .\pkg-snapdragon\lib\libggml-htp.cat +Verifying: .\pkg-snapdragon\lib\libggml-htp.cat + +Signature Index: 0 (Primary Signature) +Hash of file (sha256): 9820C664DA59D5EAE31DBB664127FCDAEF59CDC31502496BC567544EC2F401CF + +Signing Certificate Chain: + Issued to: GGML.HTP.v1 +... +Successfully verified: .\pkg-snapdragon\lib\libggml-htp.cat +... +``` diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 260ad48f0..265023733 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -222,6 +222,7 @@ if (GGML_SCHED_NO_REALLOC) endif() add_library(ggml + ggml-backend-dl.cpp ggml-backend-reg.cpp) add_library(ggml::ggml ALIAS ggml) diff --git a/ggml/src/ggml-backend-dl.cpp b/ggml/src/ggml-backend-dl.cpp new file mode 100644 index 000000000..a65cf0090 --- /dev/null +++ b/ggml/src/ggml-backend-dl.cpp @@ -0,0 +1,48 @@ +#include "ggml-backend-dl.h" + +#ifdef _WIN32 + +dl_handle * dl_load_library(const fs::path & path) { + // suppress error dialogs for missing DLLs + DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS); + SetErrorMode(old_mode | SEM_FAILCRITICALERRORS); + + HMODULE handle = LoadLibraryW(path.wstring().c_str()); + + SetErrorMode(old_mode); + + return handle; +} + +void * dl_get_sym(dl_handle * handle, const char * name) { + DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS); + SetErrorMode(old_mode | SEM_FAILCRITICALERRORS); + + void * p = (void *) GetProcAddress(handle, name); + + SetErrorMode(old_mode); + + return p; +} + +const char * dl_error() { + return ""; +} + +#else + +dl_handle * dl_load_library(const fs::path & path) { + dl_handle * handle = dlopen(path.string().c_str(), RTLD_NOW | RTLD_LOCAL); + return handle; +} + +void * dl_get_sym(dl_handle * handle, const char * name) { + return dlsym(handle, name); +} + +const char * dl_error() { + const char *rslt = dlerror(); + return rslt != nullptr ? rslt : ""; +} + +#endif diff --git a/ggml/src/ggml-backend-dl.h b/ggml/src/ggml-backend-dl.h new file mode 100644 index 000000000..f74b7c948 --- /dev/null +++ b/ggml/src/ggml-backend-dl.h @@ -0,0 +1,45 @@ +#pragma once + +#ifdef _WIN32 +# define WIN32_LEAN_AND_MEAN +# ifndef NOMINMAX +# define NOMINMAX +# endif +# include +# include +#else +# include +# include +#endif +#include + +namespace fs = std::filesystem; + +#ifdef _WIN32 + +using dl_handle = std::remove_pointer_t; + +struct dl_handle_deleter { + void operator()(HMODULE handle) { + FreeLibrary(handle); + } +}; + +#else + +using dl_handle = void; + +struct dl_handle_deleter { + void operator()(void * handle) { + dlclose(handle); + } +}; + +#endif + +using dl_handle_ptr = std::unique_ptr; + +dl_handle * dl_load_library(const fs::path & path); +void * dl_get_sym(dl_handle * handle, const char * name); +const char * dl_error(); + diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index dd991f262..8a693f84a 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -1,5 +1,6 @@ #include "ggml-backend-impl.h" #include "ggml-backend.h" +#include "ggml-backend-dl.h" #include "ggml-impl.h" #include #include @@ -98,72 +99,6 @@ static std::string path_str(const fs::path & path) { } } -#ifdef _WIN32 - -using dl_handle = std::remove_pointer_t; - -struct dl_handle_deleter { - void operator()(HMODULE handle) { - FreeLibrary(handle); - } -}; - -static dl_handle * dl_load_library(const fs::path & path) { - // suppress error dialogs for missing DLLs - DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS); - SetErrorMode(old_mode | SEM_FAILCRITICALERRORS); - - HMODULE handle = LoadLibraryW(path.wstring().c_str()); - - SetErrorMode(old_mode); - - return handle; -} - -static void * dl_get_sym(dl_handle * handle, const char * name) { - DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS); - SetErrorMode(old_mode | SEM_FAILCRITICALERRORS); - - void * p = (void *) GetProcAddress(handle, name); - - SetErrorMode(old_mode); - - return p; -} - -static const char * dl_error() { - return ""; -} - -#else - -using dl_handle = void; - -struct dl_handle_deleter { - void operator()(void * handle) { - dlclose(handle); - } -}; - -static void * dl_load_library(const fs::path & path) { - dl_handle * handle = dlopen(path.string().c_str(), RTLD_NOW | RTLD_LOCAL); - - return handle; -} - -static void * dl_get_sym(dl_handle * handle, const char * name) { - return dlsym(handle, name); -} - -static const char * dl_error() { - const char *rslt = dlerror(); - return rslt != nullptr ? rslt : ""; -} - -#endif - -using dl_handle_ptr = std::unique_ptr; - struct ggml_backend_reg_entry { ggml_backend_reg_t reg; dl_handle_ptr handle; diff --git a/ggml/src/ggml-hexagon/CMakeLists.txt b/ggml/src/ggml-hexagon/CMakeLists.txt index d58e28782..2b6919701 100644 --- a/ggml/src/ggml-hexagon/CMakeLists.txt +++ b/ggml/src/ggml-hexagon/CMakeLists.txt @@ -1,7 +1,17 @@ +file(TO_CMAKE_PATH "${HEXAGON_SDK_ROOT}" HEXAGON_SDK_ROOT) +file(TO_CMAKE_PATH "${HEXAGON_TOOLS_ROOT}" HEXAGON_TOOLS_ROOT) + +if (NOT IS_DIRECTORY "${HEXAGON_SDK_ROOT}" OR NOT IS_DIRECTORY "${HEXAGON_TOOLS_ROOT}") + message(FATAL_ERROR "Make sure HEXAGON_SDK_ROOT and HEXAGON_TOOLS_ROOT point to the correct Hexagon SDK installation.") +endif() + +message(STATUS "hexagon: using ${HEXAGON_SDK_ROOT} and ${HEXAGON_TOOLS_ROOT} for building libggml-htp skels") + include(${HEXAGON_SDK_ROOT}/build/cmake/hexagon_fun.cmake) include(ExternalProject) option(GGML_HEXAGON_HTP_DEBUG "ggml-hexagon: enable HTP debug output" OFF) +set(GGML_HEXAGON_HTP_CERT "$ENV{HEXAGON_HTP_CERT}" CACHE PATH "ggml-hexagon: enable HTP library signing using certificate") set(GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE 128 CACHE STRING "ggml-hexagon: quantize group size (32, 64, or 128)") add_library(htp_iface OBJECT @@ -25,56 +35,71 @@ else() target_link_options(htp_iface PUBLIC -ldl) endif() -link_custom_library(htp_iface cdsprpc) -link_custom_library(htp_iface rpcmem) - set(TARGET_NAME ggml-hexagon) ggml_add_backend_library(${TARGET_NAME} - ggml-hexagon.cpp htp-utils.c htp-utils.h ../../include/ggml-hexagon.h) + ggml-hexagon.cpp + htp-drv.cpp + htp-drv.h + libdl.h + ../../include/ggml-hexagon.h) target_link_libraries(${TARGET_NAME} PRIVATE htp_iface) target_include_directories(${TARGET_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/htp ${CMAKE_CURRENT_BINARY_DIR}) -# Build HTP bits -set(HTP_CMAKE_ARGS - -DCMAKE_TOOLCHAIN_FILE=${CMAKE_CURRENT_SOURCE_DIR}/htp/cmake-toolchain.cmake - -DCMAKE_BUILD_TYPE=Release - -DCMAKE_INSTALL_LIBDIR=${CMAKE_CURRENT_BINARY_DIR} - -DHEXAGON_SDK_ROOT=$ENV{HEXAGON_SDK_ROOT} - -DHEXAGON_TOOLS_ROOT=$ENV{HEXAGON_TOOLS_ROOT} - -DHEXAGON_HTP_DEBUG=${GGML_HEXAGON_HTP_DEBUG} - -DGGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE=${GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE}) - -ExternalProject_Add(htp-v68 - SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/htp BUILD_ALWAYS ON - CMAKE_ARGS ${HTP_CMAKE_ARGS} -DDSP_VERSION=v68 -DPREBUILT_LIB_DIR="toolv19_v68") - -ExternalProject_Add(htp-v69 - SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/htp BUILD_ALWAYS ON - CMAKE_ARGS ${HTP_CMAKE_ARGS} -DDSP_VERSION=v69 -DPREBUILT_LIB_DIR="toolv19_v69") - -ExternalProject_Add(htp-v73 - SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/htp BUILD_ALWAYS ON - CMAKE_ARGS ${HTP_CMAKE_ARGS} -DDSP_VERSION=v73 -DPREBUILT_LIB_DIR="toolv19_v73") - -ExternalProject_Add(htp-v75 - SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/htp BUILD_ALWAYS ON - CMAKE_ARGS ${HTP_CMAKE_ARGS} -DDSP_VERSION=v75 -DPREBUILT_LIB_DIR="toolv19_v75") - -ExternalProject_Add(htp-v79 - SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/htp BUILD_ALWAYS ON - CMAKE_ARGS ${HTP_CMAKE_ARGS} -DDSP_VERSION=v79 -DPREBUILT_LIB_DIR="toolv19_v79") - -ExternalProject_Add(htp-v81 - SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/htp BUILD_ALWAYS ON - CMAKE_ARGS ${HTP_CMAKE_ARGS} -DDSP_VERSION=v81 -DPREBUILT_LIB_DIR="toolv19_v81") +# Build HTP skels +set(HTP_SKELS) +function(build_htp_skel V) + ExternalProject_Add(htp-${V} + SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/htp BUILD_ALWAYS ON + BUILD_BYPRODUCTS ${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-${V}.so + CMAKE_ARGS + -DCMAKE_BUILD_TYPE=Release + -DCMAKE_TOOLCHAIN_FILE=${CMAKE_CURRENT_SOURCE_DIR}/htp/cmake-toolchain.cmake + -DCMAKE_INSTALL_LIBDIR=${CMAKE_CURRENT_BINARY_DIR} + -DHEXAGON_SDK_ROOT=${HEXAGON_SDK_ROOT} + -DHEXAGON_TOOLS_ROOT=${HEXAGON_TOOLS_ROOT} + -DHEXAGON_HTP_DEBUG=${GGML_HEXAGON_HTP_DEBUG} + -DGGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE=${GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE} + -DDSP_VERSION=${V} + -DPREBUILT_LIB_DIR="toolv19_${V}") + list(APPEND HTP_SKELS ${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-${V}.so) + set(HTP_SKELS ${HTP_SKELS} PARENT_SCOPE) +endfunction() + +build_htp_skel(v68) +build_htp_skel(v69) +build_htp_skel(v73) +build_htp_skel(v75) +build_htp_skel(v79) +build_htp_skel(v81) # Install Hexagon skels required at runtime -install(FILES - ${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-v68.so - ${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-v69.so - ${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-v73.so - ${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-v75.so - ${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-v79.so - ${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-v81.so - TYPE LIB) +install(FILES ${HTP_SKELS} TYPE LIB) + +if (CMAKE_SYSTEM_NAME MATCHES Windows AND GGML_HEXAGON_HTP_CERT) + file(TO_CMAKE_PATH "$ENV{WINDOWS_SDK_BIN}/arm64" WINSDK_BIN0_ARM64) + file(TO_CMAKE_PATH "$ENV{WINDOWS_SDK_BIN}/x86" WINSDK_BIN0_X86) + file(TO_CMAKE_PATH "$ENV{WindowsSdkVerBinPath}/arm64" WINSDK_BIN1_ARM64) + file(TO_CMAKE_PATH "$ENV{WindowsSdkVerBinPath}/x86" WINSDK_BIN1_X86) + + set(WINSDK_PATHS ${WINSDK_BIN0_ARM64} ${WINSDK_BIN0_X86} ${WINSDK_BIN1_ARM64} ${WINSDK_BIN1_X86}) + + find_program(INF2CAT NAMES inf2cat.exe PATHS ${WINSDK_PATHS} REQUIRED) + find_program(SIGNTOOL NAMES signtool.exe PATHS ${WINSDK_PATHS} REQUIRED) + + message(STATUS "hexagon: using ${GGML_HEXAGON_HTP_CERT} to sign libggml-htp skels") + + set(LIBGGML_HTP_CAT ${CMAKE_CURRENT_BINARY_DIR}/libggml-htp.cat) + add_custom_target(libggml-htp-cat + BYPRODUCTS ${LIBGGML_HTP_CAT} + DEPENDS libggml-htp.inf ${HTP_SKELS} + COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/libggml-htp.inf ${CMAKE_CURRENT_BINARY_DIR} + COMMAND ${INF2CAT} /driver:${CMAKE_CURRENT_BINARY_DIR} /os:10_25H2_ARM64 + COMMAND ${SIGNTOOL} sign /fd sha256 /f ${GGML_HEXAGON_HTP_CERT} ${LIBGGML_HTP_CAT} + COMMENT "generating and signing libggml-htp.cat file" + VERBATIM + ) + + add_dependencies(${TARGET_NAME} libggml-htp-cat) + install(FILES ${LIBGGML_HTP_CAT} TYPE LIB) +endif() diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 5b835c11c..4f0a1620f 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -14,9 +14,6 @@ #ifdef _WIN32 # include -# ifndef _WINDOWS -# define _WINDOWS -# endif #else # include # include @@ -25,8 +22,6 @@ #pragma clang diagnostic ignored "-Wnested-anon-types" #pragma clang diagnostic ignored "-Wgnu-anonymous-struct" -#include "htp-utils.h" - #include #include #include @@ -40,6 +35,7 @@ #include "op-desc.h" #include "htp-msg.h" #include "htp_iface.h" +#include "htp-drv.h" static size_t opt_ndev = 1; static size_t opt_nhvx = 0; // use all @@ -150,9 +146,9 @@ void ggml_hexagon_session::enqueue(struct htp_general_req &req, struct dspqueue_ 0, // flags - the framework will autoset this n_bufs, // number of buffers bufs, // buffer references - sizeof(req), + sizeof(req), // Message length (const uint8_t *) &req, // Message - 1000000 // Timeout + DSPQUEUE_TIMEOUT // Timeout ); if (err != 0) { @@ -182,13 +178,13 @@ void ggml_hexagon_session::flush() { // Read response packet from queue int err = dspqueue_read(q, &flags, - HTP_MAX_PACKET_BUFFERS, // Maximum number of buffer references - &n_bufs, // Number of buffer references - bufs, // Buffer references - sizeof(rsp), // Max message length - &rsp_size, // Message length - (uint8_t *) &rsp, - 1000000); // Timeout + HTP_MAX_PACKET_BUFFERS, // Maximum number of buffer references + &n_bufs, // Number of buffer references + bufs, // Buffer references + sizeof(rsp), // Max message length + &rsp_size, // Message length + (uint8_t *) &rsp, // Message + DSPQUEUE_TIMEOUT); // Timeout if (err == AEE_EEXPIRED) { // TODO: might need to bail out if the HTP is stuck on something @@ -269,13 +265,7 @@ struct ggml_backend_hexagon_buffer_context { ggml_backend_hexagon_buffer_context(ggml_hexagon_session * sess, size_t size, bool repack) { size += 4 * 1024; // extra page for padding - if (rpcmem_alloc2) { - this->base = (uint8_t *) rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size); - } else { - GGML_LOG_INFO("ggml-hex: %s rpcmem_alloc2 not found, falling back to rpcmem_alloc\n", sess->name.c_str()); - this->base = (uint8_t *) rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size); - } - + this->base = (uint8_t *) rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size); if (!this->base) { GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer : size %zu\n", sess->name.c_str(), size); throw std::runtime_error("ggml-hex: rpcmem_alloc failed (see log for details)"); @@ -2461,12 +2451,12 @@ static void ggml_backend_hexagon_free(ggml_backend_t backend) { } static inline bool op_reuse_src1(const ggml_tensor * op1, const ggml_tensor * op0) { - return (op0 && op0->src[1] == op1->src[1] && ggml_is_quantized(op0->src[0]->type) && ggml_is_quantized(op1->src[1]->type)); + return (op0 && op0->src[1] == op1->src[1] && ggml_is_quantized(op0->src[0]->type)); } static inline bool is_compute_op(ggml_tensor *node) { - return !(ggml_op_is_empty(node->op) || ggml_is_empty(node)); + return !ggml_op_is_empty(node->op) && !ggml_is_empty(node) && (node->flags & GGML_TENSOR_FLAG_COMPUTE); } // scan the graph and figure out last compute op index @@ -2488,7 +2478,7 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg const int last = last_compute_op(graph); - const struct ggml_tensor * prev_quant_op = nullptr; // prev executed op with quantizer + const struct ggml_tensor * prev_op = nullptr; // prev executed op for (int i = 0; i < graph->n_nodes; ++i) { ggml_tensor * node = graph->nodes[i]; @@ -2497,17 +2487,15 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg continue; } - if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) { - continue; - } - uint32_t flags = 0; // skip quantizer if src1 is reused - if (op_reuse_src1(node, prev_quant_op)) { + if (op_reuse_src1(node, prev_op)) { flags |= HTP_OPFLAGS_SKIP_QUANTIZE; } + prev_op = node; + // ask for early notification for the last Op if (i == last) { flags |= HTP_OPFLAGS_EARLY_WAKEUP; @@ -2520,7 +2508,6 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg } else { ggml_hexagon_dispatch_op>(sess, node, flags); } - prev_quant_op = node; break; case GGML_OP_MUL_MAT_ID: if (ggml_is_quantized(node->src[0]->type)) { @@ -2528,7 +2515,6 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg } else { ggml_hexagon_dispatch_op>(sess, node, flags); } - prev_quant_op = node; break; case GGML_OP_MUL: case GGML_OP_ADD: @@ -2670,7 +2656,7 @@ static std::vector ggml_hexagon_graph_optimize_reorder(const std::vectorcontext = new ggml_hexagon_registry(reg); HEX_VERBOSE("ggml-hex: size-of-general-req %zu size-of-general-rsp %zu\n", sizeof(struct htp_general_req), @@ -3180,6 +3170,11 @@ ggml_backend_reg_t ggml_backend_hexagon_reg(void) { static std::mutex mutex; std::lock_guard lock(mutex); if (!initialized) { + auto nErr = htpdrv_init(); + if (nErr != AEE_SUCCESS) { + return NULL; + } + ggml_hexagon_init(®); } diff --git a/ggml/src/ggml-hexagon/htp-drv.cpp b/ggml/src/ggml-hexagon/htp-drv.cpp new file mode 100644 index 000000000..2530bb06d --- /dev/null +++ b/ggml/src/ggml-hexagon/htp-drv.cpp @@ -0,0 +1,418 @@ +// sample drv interface + +#pragma clang diagnostic ignored "-Wgnu-anonymous-struct" +#pragma clang diagnostic ignored "-Wmissing-prototypes" +#pragma clang diagnostic ignored "-Wsign-compare" + +#include +#include +#include +#include +#ifdef _WIN32 +# define WIN32_LEAN_AND_MEAN +# ifndef NOMINMAX +# define NOMINMAX +# endif +# include +# include +#else +# include +# include +#endif +#include "ggml-impl.h" +#include "htp-drv.h" +#include "libdl.h" + +#include + +// +// Driver API types +// + +typedef void * (*rpcmem_alloc_pfn_t)(int heapid, uint32_t flags, int size); +typedef void * (*rpcmem_alloc2_pfn_t)(int heapid, uint32_t flags, size_t size); +typedef void (*rpcmem_free_pfn_t)(void * po); +typedef int (*rpcmem_to_fd_pfn_t)(void * po); + +typedef AEEResult (*dspqueue_create_pfn_t)(int domain, + uint32_t flags, + uint32_t req_queue_size, + uint32_t resp_queue_size, + dspqueue_callback_t packet_callback, + dspqueue_callback_t error_callback, + void * callback_context, + dspqueue_t * queue); +typedef AEEResult (*dspqueue_close_pfn_t)(dspqueue_t queue); +typedef AEEResult (*dspqueue_export_pfn_t)(dspqueue_t queue, uint64_t *queue_id); +typedef AEEResult (*dspqueue_write_pfn_t)(dspqueue_t queue, uint32_t flags, + uint32_t num_buffers, + struct dspqueue_buffer *buffers, + uint32_t message_length, + const uint8_t *message, + uint32_t timeout_us); +typedef AEEResult (*dspqueue_read_pfn_t)(dspqueue_t queue, uint32_t *flags, + uint32_t max_buffers, uint32_t *num_buffers, + struct dspqueue_buffer *buffers, + uint32_t max_message_length, + uint32_t *message_length, uint8_t *message, + uint32_t timeout_us); + +typedef int (*fastrpc_mmap_pfn_t)(int domain, int fd, void *addr, int offset, size_t length, enum fastrpc_map_flags flags); +typedef int (*fastrpc_munmap_pfn_t)(int domain, int fd, void *addr, size_t length); + +typedef int (*remote_handle64_open_pfn_t)(const char* name, remote_handle64 *ph); +typedef int (*remote_handle64_invoke_pfn_t)(remote_handle64 h, uint32_t dwScalars, remote_arg *pra); +typedef int (*remote_handle64_close_pfn_t)(remote_handle h); +typedef int (*remote_handle_control_pfn_t)(uint32_t req, void* data, uint32_t datalen); +typedef int (*remote_handle64_control_pfn_t)(remote_handle64 h, uint32_t req, void* data, uint32_t datalen); +typedef int (*remote_session_control_pfn_t)(uint32_t req, void *data, uint32_t datalen); + +// +// Driver API pfns +// + +rpcmem_alloc_pfn_t rpcmem_alloc_pfn = nullptr; +rpcmem_alloc2_pfn_t rpcmem_alloc2_pfn = nullptr; +rpcmem_free_pfn_t rpcmem_free_pfn = nullptr; +rpcmem_to_fd_pfn_t rpcmem_to_fd_pfn = nullptr; + +fastrpc_mmap_pfn_t fastrpc_mmap_pfn = nullptr; +fastrpc_munmap_pfn_t fastrpc_munmap_pfn = nullptr; + +dspqueue_create_pfn_t dspqueue_create_pfn = nullptr; +dspqueue_close_pfn_t dspqueue_close_pfn = nullptr; +dspqueue_export_pfn_t dspqueue_export_pfn = nullptr; +dspqueue_write_pfn_t dspqueue_write_pfn = nullptr; +dspqueue_read_pfn_t dspqueue_read_pfn = nullptr; + +remote_handle64_open_pfn_t remote_handle64_open_pfn = nullptr; +remote_handle64_invoke_pfn_t remote_handle64_invoke_pfn = nullptr; +remote_handle64_close_pfn_t remote_handle64_close_pfn = nullptr; +remote_handle_control_pfn_t remote_handle_control_pfn = nullptr; +remote_handle64_control_pfn_t remote_handle64_control_pfn = nullptr; +remote_session_control_pfn_t remote_session_control_pfn = nullptr; + +// +// Driver API +// + +void * rpcmem_alloc(int heapid, uint32_t flags, int size) { + return rpcmem_alloc_pfn(heapid, flags, size); +} + +void * rpcmem_alloc2(int heapid, uint32_t flags, size_t size) { + if (rpcmem_alloc2_pfn) { + return rpcmem_alloc2_pfn(heapid, flags, size); + } else { + GGML_LOG_INFO("ggml-hex: rpcmem_alloc2 not found, falling back to rpcmem_alloc\n"); + return rpcmem_alloc_pfn(heapid, flags, size); + } +} + +void rpcmem_free(void * po) { + return rpcmem_free_pfn(po); +} + +int rpcmem_to_fd(void * po) { + return rpcmem_to_fd_pfn(po); +} + +HTPDRV_API int fastrpc_mmap(int domain, int fd, void * addr, int offset, size_t length, enum fastrpc_map_flags flags) { + return fastrpc_mmap_pfn(domain, fd, addr, offset, length, flags); +} + +HTPDRV_API int fastrpc_munmap(int domain, int fd, void * addr, size_t length) { + return fastrpc_munmap_pfn(domain, fd, addr, length); +} + +AEEResult dspqueue_create(int domain, + uint32_t flags, + uint32_t req_queue_size, + uint32_t resp_queue_size, + dspqueue_callback_t packet_callback, + dspqueue_callback_t error_callback, + void * callback_context, + dspqueue_t * queue) { + return dspqueue_create_pfn(domain, flags, req_queue_size, resp_queue_size, packet_callback, error_callback, + callback_context, queue); +} + +AEEResult dspqueue_close(dspqueue_t queue) { + return dspqueue_close_pfn(queue); +} + +AEEResult dspqueue_export(dspqueue_t queue, uint64_t * queue_id) { + return dspqueue_export_pfn(queue, queue_id); +} + +AEEResult dspqueue_write(dspqueue_t queue, + uint32_t flags, + uint32_t num_buffers, + struct dspqueue_buffer * buffers, + uint32_t message_length, + const uint8_t * message, + uint32_t timeout_us) { + return dspqueue_write_pfn(queue, flags, num_buffers, buffers, message_length, message, timeout_us); +} + +AEEResult dspqueue_read(dspqueue_t queue, + uint32_t * flags, + uint32_t max_buffers, + uint32_t * num_buffers, + struct dspqueue_buffer * buffers, + uint32_t max_message_length, + uint32_t * message_length, + uint8_t * message, + uint32_t timeout_us) { + return dspqueue_read_pfn(queue, flags, max_buffers, num_buffers, buffers, max_message_length, message_length, + message, timeout_us); +} + +HTPDRV_API int remote_handle64_open(const char * name, remote_handle64 * ph) { + return remote_handle64_open_pfn(name, ph); +} + +HTPDRV_API int remote_handle64_invoke(remote_handle64 h, uint32_t dwScalars, remote_arg * pra) { + return remote_handle64_invoke_pfn(h, dwScalars, pra); +} + +HTPDRV_API int remote_handle64_close(remote_handle64 h) { + return remote_handle64_close_pfn(h); +} + +HTPDRV_API int remote_handle_control(uint32_t req, void * data, uint32_t datalen) { + return remote_handle_control_pfn(req, data, datalen); +} + +HTPDRV_API int remote_handle64_control(remote_handle64 h, uint32_t req, void * data, uint32_t datalen) { + return remote_handle64_control_pfn(h, req, data, datalen); +} + +HTPDRV_API int remote_session_control(uint32_t req, void * data, uint32_t datalen) { + return remote_session_control_pfn(req, data, datalen); +} + +#ifdef _WIN32 + +static std::string wstr_to_str(std::wstring_view wstr) { + std::string result; + if (wstr.empty()) { + return result; + } + auto bytes_needed = WideCharToMultiByte(CP_UTF8, WC_ERR_INVALID_CHARS, + wstr.data(), (int) wstr.size(), + nullptr, 0, nullptr, nullptr); + if (bytes_needed == 0) { + GGML_LOG_ERROR("ggml-hex: WideCharToMultiByte failed. Error %lu\n", GetLastError()); + throw std::runtime_error("Invalid wstring input"); + } + + result.resize(bytes_needed, '\0'); + int bytes_written = WideCharToMultiByte(CP_UTF8, WC_ERR_INVALID_CHARS, + wstr.data(), (int) wstr.size(), + result.data(), bytes_needed, + nullptr, nullptr); + if (bytes_written == 0) { + GGML_LOG_ERROR("ggml-hex: WideCharToMultiByte failed. Error %lu\n", GetLastError()); + throw std::runtime_error("Wstring conversion failed"); + } + return result; +} + +static std::string get_driver_path() { + std::wstring serviceName = L"qcnspmcdm"; + std::string result; + + // Get a handle to the SCM database. + SC_HANDLE schSCManager = OpenSCManagerW(NULL, NULL, STANDARD_RIGHTS_READ); + if (nullptr == schSCManager) { + GGML_LOG_ERROR("ggml-hex: Failed to open SCManager. Error: %lu\n", GetLastError()); + return result; + } + + // Get a handle to the service. + SC_HANDLE schService = OpenServiceW(schSCManager, // SCM database + serviceName.c_str(), // name of service + SERVICE_QUERY_CONFIG); // need query config access + + if (nullptr == schService) { + GGML_LOG_ERROR("ggml-hex: Failed to open qcnspmcdm service. Error: %lu\n", GetLastError()); + CloseServiceHandle(schSCManager); + return result; + } + + // Store the size of buffer used as an output. + DWORD bufferSize; + if (!QueryServiceConfigW(schService, NULL, 0, &bufferSize) && + (GetLastError() != ERROR_INSUFFICIENT_BUFFER)) { + GGML_LOG_ERROR("ggml-hex: Failed to query service config. Error: %lu\n", GetLastError()); + CloseServiceHandle(schService); + CloseServiceHandle(schSCManager); + return result; + } + // Get the configuration of the service. + LPQUERY_SERVICE_CONFIGW serviceConfig = + static_cast(LocalAlloc(LMEM_FIXED, bufferSize)); + if (!QueryServiceConfigW(schService, serviceConfig, bufferSize, &bufferSize)) { + fprintf(stderr, "ggml-hex: Failed to query service config. Error: %lu\n", GetLastError()); + LocalFree(serviceConfig); + CloseServiceHandle(schService); + CloseServiceHandle(schSCManager); + return result; + } + + // Read the driver file path get its parent directory + std::wstring driverPath = std::wstring(serviceConfig->lpBinaryPathName); + driverPath = driverPath.substr(0, driverPath.find_last_of(L"\\")); + + // Clean up resources + LocalFree(serviceConfig); + CloseServiceHandle(schService); + CloseServiceHandle(schSCManager); + + // Driver path would contain invalid path string, like: + // \SystemRoot\System32\DriverStore\FileRepository\qcadsprpc8280.inf_arm64_c2b9460c9a072f37 + // "\SystemRoot" should be replace with a correct one (e.g. C:\Windows) + const std::wstring systemRootPlaceholder = L"\\SystemRoot"; + if (0 != driverPath.compare(0, systemRootPlaceholder.length(), systemRootPlaceholder)) { + GGML_LOG_ERROR("ggml-hex: String pattern not found in driver path.\n"); + return result; + } + + // Replace \SystemRoot with an absolute path from system ENV windir + const std::wstring systemRootEnv = L"windir"; + + // Query the number of wide charactors this variable requires + DWORD numWords = GetEnvironmentVariableW(systemRootEnv.c_str(), NULL, 0); + if (numWords == 0) { + GGML_LOG_ERROR("ggml-hex: Failed get systemRoot environment variable\n"); + return result; + } + + // Query the actual system root name from environment variable + std::vector systemRoot(numWords + 1); + numWords = GetEnvironmentVariableW(systemRootEnv.c_str(), systemRoot.data(), numWords + 1); + if (numWords == 0) { + GGML_LOG_ERROR("ggml-hex: Failed to read windir environment variable\n"); + return result; + } + driverPath.replace(0, systemRootPlaceholder.length(), std::wstring(systemRoot.data())); + + return wstr_to_str(driverPath); +} + +#endif + +using dl_handle_ptr = std::unique_ptr; + +int htpdrv_init() { + static dl_handle_ptr lib_cdsp_rpc_handle = nullptr; + static bool initialized = false; +#ifdef _WIN32 + std::string drv_path = get_driver_path() + "\\" + "libcdsprpc.dll"; +#else + std::string drv_path = "libcdsprpc.so"; +#endif + if (initialized) { + GGML_LOG_INFO("ggml-hex: Driver already loaded\n"); + return AEE_SUCCESS; + } + GGML_LOG_INFO("ggml-hex: Loading driver %s\n", drv_path.c_str()); + + fs::path path{ drv_path.c_str() }; + dl_handle_ptr handle { dl_load_library(path) }; + if (!handle) { + GGML_LOG_ERROR("ggml-hex: failed to load %s: %s\n", path.u8string().c_str(), dl_error()); + return AEE_EUNABLETOLOAD; + } + +#define dlsym(drv, type, pfn, symbol, ignore) \ + do { \ + pfn = (type) dl_get_sym(drv, #symbol); \ + if (!ignore && nullptr == pfn) { \ + GGML_LOG_ERROR("ggml-hex: failed to dlsym %s\n", #symbol); \ + return AEE_EUNABLETOLOAD; \ + } \ + } while (0) + + dlsym(handle.get(), rpcmem_alloc_pfn_t, rpcmem_alloc_pfn, rpcmem_alloc, false); + dlsym(handle.get(), rpcmem_alloc2_pfn_t, rpcmem_alloc2_pfn, rpcmem_alloc2, true); + dlsym(handle.get(), rpcmem_free_pfn_t, rpcmem_free_pfn, rpcmem_free, false); + dlsym(handle.get(), rpcmem_to_fd_pfn_t, rpcmem_to_fd_pfn, rpcmem_to_fd, false); + dlsym(handle.get(), fastrpc_mmap_pfn_t, fastrpc_mmap_pfn, fastrpc_mmap, false); + dlsym(handle.get(), fastrpc_munmap_pfn_t, fastrpc_munmap_pfn, fastrpc_munmap, false); + dlsym(handle.get(), dspqueue_create_pfn_t, dspqueue_create_pfn, dspqueue_create, false); + dlsym(handle.get(), dspqueue_close_pfn_t, dspqueue_close_pfn, dspqueue_close, false); + dlsym(handle.get(), dspqueue_export_pfn_t, dspqueue_export_pfn, dspqueue_export, false); + dlsym(handle.get(), dspqueue_write_pfn_t, dspqueue_write_pfn, dspqueue_write, false); + dlsym(handle.get(), dspqueue_read_pfn_t, dspqueue_read_pfn, dspqueue_read, false); + dlsym(handle.get(), remote_handle64_open_pfn_t, remote_handle64_open_pfn, remote_handle64_open, false); + dlsym(handle.get(), remote_handle64_invoke_pfn_t, remote_handle64_invoke_pfn, remote_handle64_invoke, false); + dlsym(handle.get(), remote_handle_control_pfn_t, remote_handle_control_pfn, remote_handle_control, false); + dlsym(handle.get(), remote_handle64_control_pfn_t, remote_handle64_control_pfn, remote_handle64_control, false); + dlsym(handle.get(), remote_session_control_pfn_t, remote_session_control_pfn, remote_session_control, false); + dlsym(handle.get(), remote_handle64_close_pfn_t, remote_handle64_close_pfn, remote_handle64_close, false); + + lib_cdsp_rpc_handle = std::move(handle); + initialized = true; + + return AEE_SUCCESS; +} + +domain * get_domain(int domain_id) { + int i = 0; + int size = sizeof(supported_domains) / sizeof(domain); + + for (i = 0; i < size; i++) { + if (supported_domains[i].id == domain_id) { + return &supported_domains[i]; + } + } + + return NULL; +} + +int get_hex_arch_ver(int domain, int * arch) { + if (!remote_handle_control_pfn) { + GGML_LOG_ERROR("ggml-hex: remote_handle_control is not supported on this device\n"); + return AEE_EUNSUPPORTEDAPI; + } + + struct remote_dsp_capability arch_ver; + arch_ver.domain = (uint32_t) domain; + arch_ver.attribute_ID = ARCH_VER; + arch_ver.capability = (uint32_t) 0; + + int err = remote_handle_control(DSPRPC_GET_DSP_INFO, &arch_ver, sizeof(arch_ver)); + if ((err & 0xff) == (AEE_EUNSUPPORTEDAPI & 0xff)) { + GGML_LOG_ERROR("ggml-hex: FastRPC capability API is not supported on this device\n"); + return AEE_EUNSUPPORTEDAPI; + } + + if (err != AEE_SUCCESS) { + GGML_LOG_ERROR("ggml-hex: FastRPC capability query failed (err %d)\n", err); + return err; + } + + switch (arch_ver.capability & 0xff) { + case 0x68: + *arch = 68; + return 0; + case 0x69: + *arch = 69; + return 0; + case 0x73: + *arch = 73; + return 0; + case 0x75: + *arch = 75; + return 0; + case 0x79: + *arch = 79; + return 0; + case 0x81: + *arch = 81; + return 0; + } + return -1; +} diff --git a/ggml/src/ggml-hexagon/htp-drv.h b/ggml/src/ggml-hexagon/htp-drv.h new file mode 100644 index 000000000..6eba7ba17 --- /dev/null +++ b/ggml/src/ggml-hexagon/htp-drv.h @@ -0,0 +1,121 @@ +#pragma once + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _WIN32 +# pragma clang diagnostic ignored "-Wignored-attributes" +#endif + +#include +#include +#include +#include + +#if defined(_WIN32) && !defined(__MINGW32__) +# ifdef GGML_BACKEND_BUILD +# define HTPDRV_API __declspec(dllexport) extern +# else +# define HTPDRV_API __declspec(dllimport) extern +# endif +#else +# define HTPDRV_API __attribute__ ((visibility ("default"))) extern +#endif + +/* Offset to differentiate HLOS and Hexagon error codes. + Stores the value of AEE_EOFFSET for Hexagon. */ +#ifndef DSP_OFFSET +# define DSP_OFFSET 0x80000400 +#endif + +/* Errno for connection reset by peer. */ +#ifndef ECONNRESET +# ifdef __hexagon__ +# define ECONNRESET 104 +# endif +#endif + +/* Abstraction of different OS specific sleep APIs. + SLEEP accepts input in seconds. */ +#ifndef SLEEP +# ifdef __hexagon__ +# define SLEEP(x) \ + { /* Do nothing for simulator. */ \ + } +# else +# ifdef _WIN32 +# define SLEEP(x) Sleep(1000 * x) /* Sleep accepts input in milliseconds. */ +# else +# define SLEEP(x) sleep(x) /* sleep accepts input in seconds. */ +# endif +# endif +#endif + +/* Include windows specific header files. */ +#ifdef _WIN32 +# include +# include +# define _CRT_SECURE_NO_WARNINGS 1 +# define _WINSOCK_DEPRECATED_NO_WARNINGS 1 +#endif + +/* Includes and defines for all HLOS except windows */ +#if !defined(__hexagon__) && !defined(_WIN32) +# include "unistd.h" + +# include +#endif + +/* Includes and defines for Hexagon and all HLOS except Windows. */ +#if !defined(_WIN32) +/* Weak reference to remote symbol for compilation. */ +# pragma weak remote_session_control +# pragma weak remote_handle_control +# pragma weak remote_handle64_control +# pragma weak fastrpc_mmap +# pragma weak fastrpc_munmap +# pragma weak rpcmem_alloc2 +#endif + +#if !defined(_WIN32) +# pragma weak remote_system_request +#endif + +#ifdef _WIN32 +# define DSPQUEUE_TIMEOUT DSPQUEUE_TIMEOUT_NONE +#else +# define DSPQUEUE_TIMEOUT 1000000 +#endif + +/** + * htpdrv_init API: driver interface entry point + * + * @return Return AEE error codes as defined in Hexagon SDK. + */ +HTPDRV_API int htpdrv_init(void); + +/** + * get_domain API: get domain struct from domain value. + * + * @param[in] domain value of a domain + * @return Returns domain struct of the domain if it is supported or else + * returns NULL. + * + */ +HTPDRV_API domain * get_domain(int domain_id); + +/** + * get_hex_arch_ver API: query the Hexagon processor architecture version information + * + * @param[in] domain_id value of a domain + * @param[out] Arch version (73, 75, ...) + * @return 0 if query is successful. + * non-zero if error, return value points to the error. + * + */ +HTPDRV_API int get_hex_arch_ver(int domain, int * arch); + +#ifdef __cplusplus +} +#endif diff --git a/ggml/src/ggml-hexagon/htp-utils.c b/ggml/src/ggml-hexagon/htp-utils.c deleted file mode 100644 index 3f335bf71..000000000 --- a/ggml/src/ggml-hexagon/htp-utils.c +++ /dev/null @@ -1,454 +0,0 @@ - -#pragma clang diagnostic ignored "-Wgnu-anonymous-struct" -#pragma clang diagnostic ignored "-Wmissing-prototypes" -#pragma clang diagnostic ignored "-Wsign-compare" - -#define GGML_COMMON_IMPL_C -#include "ggml-backend-impl.h" -#include "ggml-common.h" -#include "ggml-hexagon.h" -#include "ggml-impl.h" - -#include "htp-utils.h" - -#include -#include -#include -#include -#include -#include -#include - -domain * get_domain(int domain_id) { - int i = 0; - int size = sizeof(supported_domains) / sizeof(domain); - - for (i = 0; i < size; i++) { - if (supported_domains[i].id == domain_id) { - return &supported_domains[i]; - } - } - - return NULL; -} - -bool is_valid_domain_id(int domain_id, int compute_only) { - int i = 0; - int size = sizeof(supported_domains) / sizeof(domain); - - if (compute_only) { - return is_CDSP(domain_id); - } - - for (i = 0; i < size; i++) { - if (supported_domains[i].id == domain_id) { - return true; - } - } - - return false; -} - -int get_domains_info(char * domain_type, int * num_domains, fastrpc_domain ** domains_info) { - int nErr = AEE_SUCCESS; - int ss_info = 0; - if (domain_type != NULL) { - if (strcmp(domain_type, "LPASS") == 0) { - ss_info = FASTRPC_LPASS; - } else if (strcmp(domain_type, "HPASS") == 0) { - ss_info = FASTRPC_HPASS; - } else { - ss_info = FASTRPC_NSP; - } - } - system_req_payload req = { 0 }; - req.id = FASTRPC_GET_DOMAINS; - req.sys.domains = NULL; - fastrpc_domain * domain = NULL; - if (ss_info != 0) { - req.sys.flags = DOMAINS_LIST_FLAGS_SET_TYPE(req.sys.flags, ss_info); - } else { - req.sys.flags = 0; - } -#ifdef _WIN32 - nErr = AEE_EUNSUPPORTED; - goto bail; -#endif - if (remote_system_request) { - nErr = remote_system_request(&req); - if (nErr != AEE_SUCCESS) { - GGML_LOG_ERROR("Failure in remote_system_request call: %d.\n", nErr); - goto bail; - } - // Allocate memory for domain-info array - req.sys.max_domains = req.sys.num_domains; - if ((req.sys.domains = calloc(req.sys.num_domains, sizeof(fastrpc_domain))) == NULL) { - nErr = AEE_ENOMEMORY; - GGML_LOG_ERROR("Unable to allocate memory for req.sys.domains"); - goto bail; - } - - nErr = remote_system_request(&req); - if (nErr != AEE_SUCCESS) { - GGML_LOG_ERROR("Failure in remote_system_request call: %d.\n", nErr); - goto bail; - } - - for (int i = 0; i < req.sys.num_domains; i++) { - // Verify that only requested type domains were returned - domain = &req.sys.domains[i]; - if (domain->type != ss_info && domain_type != NULL) { - nErr = -1; - GGML_LOG_ERROR("Incorrect data received from remote_system_request.\n"); - goto bail; - } - } - *domains_info = req.sys.domains; - *num_domains = req.sys.num_domains; - } else { - nErr = AEE_EUNSUPPORTED; - goto bail; - } -bail: - if (nErr && !req.sys.domains) { - free(req.sys.domains); - } - return nErr; -} - -int get_effective_domain_id(char * domain_name, int session_id, int * effec_domain_id) { - int err = 0; - remote_rpc_effective_domain_id_t sess = { 0 }; - - sess.domain_name = domain_name; - sess.domain_name_len = strlen(domain_name); - sess.session_id = session_id; - - err = remote_session_control(FASTRPC_GET_EFFECTIVE_DOMAIN_ID, &sess, sizeof(sess)); - if (err) { - GGML_LOG_ERROR("Error 0x%x: failed to get effective domain id for %s, session id %d\n", err, sess.domain_name, - session_id); - return err; - } - - *effec_domain_id = sess.effective_domain_id; - return err; -} - -int get_dsp_support(int * domain) { - int nErr = AEE_SUCCESS; - *domain = CDSP_DOMAIN_ID; // DSP domain default value is CDSP_DOMAIN_ID - - if (remote_handle_control) { - struct remote_dsp_capability dsp_capability_domain = { CDSP_DOMAIN_ID, DOMAIN_SUPPORT, 0 }; - nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_domain, sizeof(struct remote_dsp_capability)); - if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { - GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device\n"); - goto bail; - } - - if (dsp_capability_domain.capability == 0) { - dsp_capability_domain.domain = ADSP_DOMAIN_ID; // Check for ADSP support. - dsp_capability_domain.attribute_ID = DOMAIN_SUPPORT; - dsp_capability_domain.capability = 0; - nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_domain, - sizeof(struct remote_dsp_capability)); - if (dsp_capability_domain.capability) { - *domain = ADSP_DOMAIN_ID; // For targets like Agatti (not having cDSP), domain is ADSP_DOMAIN_ID - } - } - - if (nErr != AEE_SUCCESS) { - GGML_LOG_ERROR("\nget_dsp_support failed with Error 0x%x\n", nErr); - goto bail; - } - } else { - nErr = AEE_EUNSUPPORTEDAPI; - GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device\n"); - } - -bail: - return nErr; -} - -int get_vtcm_info(int domain, uint32_t * capability, uint32_t attr) { - int nErr = AEE_SUCCESS; - *capability = 0; - - if (attr == VTCM_PAGE || attr == VTCM_COUNT) { - } else { - nErr = AEE_EBADPARM; - GGML_LOG_ERROR("Unsupported attr. Only VTCM_PAGE and VTCM_COUNT supported\n"); - goto bail; - } - if (remote_handle_control) { - if (domain == ADSP_DOMAIN_ID || domain == CDSP_DOMAIN_ID) { - /* - * Query the DSP for VTCM information - * Since the ADSP does not have a dedicated VTCM, we expect the output to be 0 - */ - struct remote_dsp_capability dsp_capability_vtcm_dsp; - dsp_capability_vtcm_dsp.domain = (uint32_t) domain; - dsp_capability_vtcm_dsp.attribute_ID = attr; - dsp_capability_vtcm_dsp.capability = (uint32_t) 0; - nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_vtcm_dsp, - sizeof(struct remote_dsp_capability)); - if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { - GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device\n"); - GGML_LOG_ERROR("Running the usecase without checking the capability\n"); - nErr = AEE_SUCCESS; - goto bail; - } else if (nErr == AEE_SUCCESS) { - *capability = dsp_capability_vtcm_dsp.capability; - } else { - GGML_LOG_ERROR("\nget_vtcm_info failed with Error 0x%x\n", nErr); - goto bail; - } - } else { - nErr = AEE_EUNSUPPORTED; - GGML_LOG_ERROR("Unsupported domain %d\n", domain); - goto bail; - } - } else { - nErr = AEE_EUNSUPPORTEDAPI; - GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device\n"); - } - -bail: - return nErr; -} - -bool is_unsignedpd_supported(int domain_id) { - int nErr = AEE_SUCCESS; - if (remote_handle_control) { - struct remote_dsp_capability dsp_capability_domain = { domain_id, UNSIGNED_PD_SUPPORT, 0 }; - nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_domain, sizeof(struct remote_dsp_capability)); - if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { - GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device. Falling back to signed pd.\n"); - return false; - } - if (nErr) { - GGML_LOG_ERROR("\nERROR 0x%x: FastRPC Capability API failed. Falling back to signed pd.", nErr); - return false; - } - if (dsp_capability_domain.capability == 1) { - return true; - } - } else { - nErr = AEE_EUNSUPPORTEDAPI; - GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device. Falling back to signed pd.\n"); - return false; - } - return false; -} - -bool get_unsignedpd_support(void) { - return is_unsignedpd_supported(CDSP_DOMAIN_ID); -} - -bool is_async_fastrpc_supported(int domain) { - int nErr = AEE_SUCCESS; - if (remote_handle_control) { - if (domain == CDSP_DOMAIN_ID) { - /* - * Query the DSP for ASYNC_FASTRPC_SUPPORT information - * Async fastrpc is supported only on CDSP - */ - struct remote_dsp_capability dsp_capability_async_support; - dsp_capability_async_support.domain = (uint32_t) domain; - dsp_capability_async_support.attribute_ID = ASYNC_FASTRPC_SUPPORT; - dsp_capability_async_support.capability = (uint32_t) 0; - nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_async_support, - sizeof(struct remote_dsp_capability)); - if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { - GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device\n"); - GGML_LOG_ERROR("Running the usecase without checking the capability\n"); - nErr = AEE_SUCCESS; - goto bail; - } else if (dsp_capability_async_support.capability == 1) { - return true; - } - if (nErr != AEE_SUCCESS) { - GGML_LOG_ERROR("\nis_async_fastrpc_supported failed with Error 0x%x\n", nErr); - goto bail; - } - } else { - nErr = AEE_EUNSUPPORTED; - GGML_LOG_ERROR("Async fastrpc is not supported on domain %d\n", domain); - goto bail; - } - } else { - nErr = AEE_EUNSUPPORTEDAPI; - GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device\n"); - } - -bail: - return false; -} - -bool is_status_notification_supported(int domain) { - int nErr = AEE_SUCCESS; - - if (remote_handle_control) { - /* - * Query the DSP for STATUS_NOTIFICATION_SUPPORT information - * DSP User PD status notification Support - */ - struct remote_dsp_capability dsp_capability_status_notification_support; - dsp_capability_status_notification_support.domain = (uint32_t) domain; - dsp_capability_status_notification_support.attribute_ID = STATUS_NOTIFICATION_SUPPORT; - dsp_capability_status_notification_support.capability = (uint32_t) 0; - nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_status_notification_support, - sizeof(struct remote_dsp_capability)); - if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { - GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device\n"); - GGML_LOG_ERROR("Running the usecase without checking the capability\n"); - nErr = AEE_SUCCESS; - goto bail; - } else if (dsp_capability_status_notification_support.capability == 1) { - return true; - } - if (nErr != AEE_SUCCESS) { - GGML_LOG_ERROR("\nis_status_notification_supported failed with Error 0x%x\n", nErr); - goto bail; - } - } else { - nErr = AEE_EUNSUPPORTEDAPI; - GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device\n"); - } - -bail: - return false; -} - -int get_hmx_support_info(int domain, uint32_t * capability, uint32_t attr) { - int nErr = AEE_SUCCESS; - *capability = 0; - - if (attr != HMX_SUPPORT_SPATIAL && attr != HMX_SUPPORT_DEPTH) { - nErr = AEE_EBADPARM; - GGML_LOG_ERROR("Unsupported attr. Only HMX_SUPPORT_SPATIAL and HMX_SUPPORT_DEPTH supported\n"); - goto bail; - } - if (remote_handle_control) { - if (domain == CDSP_DOMAIN_ID) { - /* - * Query the DSP for HMX SUPPORT information - * HMX is supported on CDSP only - */ - struct remote_dsp_capability dsp_capability_hmx_dsp; - dsp_capability_hmx_dsp.domain = (uint32_t) domain; - dsp_capability_hmx_dsp.attribute_ID = attr; - dsp_capability_hmx_dsp.capability = (uint32_t) 0; - nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_hmx_dsp, - sizeof(struct remote_dsp_capability)); - if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { - GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device\n"); - GGML_LOG_ERROR("Running the usecase without checking the capability\n"); - nErr = AEE_SUCCESS; - goto bail; - } else if (nErr == AEE_SUCCESS) { - *capability = dsp_capability_hmx_dsp.capability; - } else { - GGML_LOG_ERROR("\nget_hmx_support_info failed with Error 0x%x\n", nErr); - goto bail; - } - } else { - nErr = AEE_EUNSUPPORTED; - GGML_LOG_ERROR("HMX support is not there for domain %d\n", domain); - goto bail; - } - } else { - nErr = AEE_EUNSUPPORTEDAPI; - GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device\n"); - } - -bail: - return nErr; -} - -int get_hex_arch_ver(int domain, int * arch) { - if (!remote_handle_control) { - GGML_LOG_ERROR("ggml-hex: remote_handle_control is not supported on this device\n"); - return AEE_EUNSUPPORTEDAPI; - } - - struct remote_dsp_capability arch_ver; - arch_ver.domain = (uint32_t) domain; - arch_ver.attribute_ID = ARCH_VER; - arch_ver.capability = (uint32_t) 0; - - int err = remote_handle_control(DSPRPC_GET_DSP_INFO, &arch_ver, sizeof(arch_ver)); - if ((err & 0xff) == (AEE_EUNSUPPORTEDAPI & 0xff)) { - GGML_LOG_ERROR("ggml-hex: FastRPC capability API is not supported on this device\n"); - return AEE_EUNSUPPORTEDAPI; - } - - if (err != AEE_SUCCESS) { - GGML_LOG_ERROR("ggml-hex: FastRPC capability query failed (err %d)\n", err); - return err; - } - - switch (arch_ver.capability & 0xff) { - case 0x68: - *arch = 68; - return 0; - case 0x69: - *arch = 69; - return 0; - case 0x73: - *arch = 73; - return 0; - case 0x75: - *arch = 75; - return 0; - case 0x79: - *arch = 79; - return 0; - case 0x81: - *arch = 81; - return 0; - } - return -1; -} - -int get_hvx_support_info(int domain, uint32_t * capability, uint32_t attr) { - int nErr = AEE_SUCCESS; - *capability = 0; - - if (remote_handle_control) { - if (domain == CDSP_DOMAIN_ID) { - /* - * Query the DSP for HVX SUPPORT information - * HVX is supported on CDSP only - */ - struct remote_dsp_capability dsp_capability_hvx_dsp; - dsp_capability_hvx_dsp.domain = (uint32_t) domain; - dsp_capability_hvx_dsp.attribute_ID = attr; - dsp_capability_hvx_dsp.capability = (uint32_t) 0; - nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_hvx_dsp, - sizeof(struct remote_dsp_capability)); - if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { - GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device\n"); - GGML_LOG_ERROR("Running the usecase without checking the capability\n"); - nErr = AEE_SUCCESS; - goto bail; - } else if (nErr == AEE_SUCCESS) { - *capability = dsp_capability_hvx_dsp.capability; - } else { - GGML_LOG_ERROR("\nget_hvx_support_info failed with Error 0x%x\n", nErr); - goto bail; - } - } else { - nErr = AEE_EUNSUPPORTED; - GGML_LOG_ERROR("HVX support is not available on domain %d\n", domain); - goto bail; - } - } else { - nErr = AEE_EUNSUPPORTEDAPI; - GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device\n"); - } - -bail: - return nErr; -} diff --git a/ggml/src/ggml-hexagon/htp-utils.h b/ggml/src/ggml-hexagon/htp-utils.h deleted file mode 100644 index 7bbae3a0b..000000000 --- a/ggml/src/ggml-hexagon/htp-utils.h +++ /dev/null @@ -1,221 +0,0 @@ -#ifndef HTP_UTILS_H -#define HTP_UTILS_H - -#ifdef __cplusplus -extern "C" { -#endif - -#include -#include -#include -#include -#include - -/* Offset to differentiate HLOS and Hexagon error codes. - Stores the value of AEE_EOFFSET for Hexagon. */ -#ifndef DSP_OFFSET -# define DSP_OFFSET 0x80000400 -#endif - -/* Errno for connection reset by peer. */ -#ifndef ECONNRESET -# ifdef __hexagon__ -# define ECONNRESET 104 -# endif -#endif - -/* Abstraction of different OS specific sleep APIs. - SLEEP accepts input in seconds. */ -#ifndef SLEEP -# ifdef __hexagon__ -# define SLEEP(x) \ - { /* Do nothing for simulator. */ \ - } -# else -# ifdef _WINDOWS -# define SLEEP(x) Sleep(1000 * x) /* Sleep accepts input in milliseconds. */ -# else -# define SLEEP(x) sleep(x) /* sleep accepts input in seconds. */ -# endif -# endif -#endif - -/* Include windows specific header files. */ -#ifdef _WINDOWS -# include -# include -# define _CRT_SECURE_NO_WARNINGS 1 -# define _WINSOCK_DEPRECATED_NO_WARNINGS 1 -/* Including this file for custom implementation of getopt function. */ -# include "getopt_custom.h" -#endif - -/* Includes and defines for all HLOS except windows */ -#if !defined(__hexagon__) && !defined(_WINDOWS) -# include "unistd.h" - -# include -#endif - -/* Includes and defines for Hexagon and all HLOS except Windows. */ -#if !defined(_WINDOWS) -/* Weak reference to remote symbol for compilation. */ -# pragma weak remote_session_control -# pragma weak remote_handle_control -# pragma weak remote_handle64_control -# pragma weak fastrpc_mmap -# pragma weak fastrpc_munmap -# pragma weak rpcmem_alloc2 -#endif - -#if !defined(_WINDOWS) -# pragma weak remote_system_request -#endif -/** - * Wrapper for FastRPC Capability API: query DSP support. - * - * @param[out] domain pointer to supported domain. - * @return 0 if query is successful. - * non-zero if error, return value points to the error. - */ -int get_dsp_support(int * domain); - -/** - * Wrapper for FastRPC Capability API: query VTCM information. - * - * @param[in] domain value of domain in the queried. - * @param[out] capability capability value of the attribute queried. - * @param[in] attr value of the attribute to the queried. - * @return 0 if query is successful. - * non-zero if error, return value points to the error. - */ -int get_vtcm_info(int domain, uint32_t * capability, uint32_t attr); - -/** - * Wrapper for FastRPC Capability API: query unsigned pd support on CDSP domain. - * - * @return true if unsigned pd is supported. - * false if unsigned pd is not supported, capability query failed. - */ - -bool get_unsignedpd_support(void); - -/** - * Wrapper for FastRPC Capability API: query unsigned pd support. - * - * @param[in] domain value of domain in the queried. - * @return true if unsigned pd is supported. - * false if unsigned pd is not supported, capability query failed. - */ - -bool is_unsignedpd_supported(int domain_id); - -/** - * is_valid_domain_id API: query a domain id is valid. - * - * @param[in] domain value of domain in the queried. - * @param[in] compute_only value of domain is only compared with CDSP domains supported by the target when enabled. - * @return true if value of domain is valid. - * false if value of domain is not valid. - */ - -bool is_valid_domain_id(int domain_id, int compute_only); - -/** - * get_domain API: get domain struct from domain value. - * - * @param[in] domain value of a domain - * @return Returns domain struct of the domain if it is supported or else - * returns NULL. - * - */ - -domain * get_domain(int domain_id); - -/** - * get_domains_info API: get information for all the domains available on the device - * - * @param[in] domain_type pointer to domain type - * @param[in] num_domains pointer to number of domains - * @param[in] domains_info pointer to save discovered domains information. - * @return 0 if query is successful. - * non-zero if error, return value points to the error. - * - * It is user's responsibility to free the memory used to store the domains info whose address is present in domains_info before closing the application. - * - */ - -int get_domains_info(char * domain_type, int * num_domains, fastrpc_domain ** domains_info); - -/** - * get_effective_domain_id API: get effective domain id for given session id - * - * @param[in] domain_name pointer to domain name - * @param[in] session_id - * @param[in] effec_domain_id pointer to save obtained effective domain id. - * @return 0 if query is successful. - * non-zero if error, return value points to the error. - * - */ - -int get_effective_domain_id(char * domain_name, int session_id, int * effec_domain_id); - -/** - * is_async_fastrpc_supported API: query a domain id has async fastrpc supported or not - * - * @param[in] domain_id value of a domain - * @return Returns true or false stating support of Async FastRPC - * - */ - -bool is_async_fastrpc_supported(int domain_id); - -/** - * is_status_notification_supported API: query the DSP for STATUS_NOTIFICATION_SUPPORT information - * - * @param[in] domain_id value of a domain - * @return Returns true or false stating status notification support information - * - */ -bool is_status_notification_supported(int domain_id); - -/** - * get_hmx_support_info API: query the DSP for HMX SUPPORT information - * - * @param[in] domain_id value of a domain - * @param[out] capability capability value of the attribute queried. - * @param[in] attr value of the attribute to the queried. - * @return 0 if query is successful. - * non-zero if error, return value points to the error. - * - */ -int get_hmx_support_info(int domain, uint32_t * capability, uint32_t attr); - -/** - * get_hex_arch_ver API: query the Hexagon processor architecture version information - * - * @param[in] domain_id value of a domain - * @param[out] Arch version (73, 75, ...) - * @return 0 if query is successful. - * non-zero if error, return value points to the error. - * - */ -int get_hex_arch_ver(int domain, int * arch); - -/** - * get_hvx_support_info API: query the DSP for HVX SUPPORT information - * - * @param[in] domain_id value of a domain - * @param[out] capability capability value of the attribute queried. - * @param[in] attr value of the attribute to the queried. - * @return 0 if query is successful. - * non-zero if error, return value points to the error. - * - */ -int get_hvx_support_info(int domain, uint32_t * capability, uint32_t attr); - -#ifdef __cplusplus -} -#endif - -#endif //DSP_CAPABILITIES_UTILS_H diff --git a/ggml/src/ggml-hexagon/libdl.h b/ggml/src/ggml-hexagon/libdl.h new file mode 100644 index 000000000..8ca5016f0 --- /dev/null +++ b/ggml/src/ggml-hexagon/libdl.h @@ -0,0 +1,79 @@ +#pragma once + +#ifdef _WIN32 +# define WIN32_LEAN_AND_MEAN +# ifndef NOMINMAX +# define NOMINMAX +# endif +# include +# include +#else +# include +# include +#endif +#include + +namespace fs = std::filesystem; + +#ifdef _WIN32 + +using dl_handle = std::remove_pointer_t; + +struct dl_handle_deleter { + void operator()(HMODULE handle) { + FreeLibrary(handle); + } +}; + +static inline dl_handle * dl_load_library(const fs::path & path) { + // suppress error dialogs for missing DLLs + DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS); + SetErrorMode(old_mode | SEM_FAILCRITICALERRORS); + + HMODULE handle = LoadLibraryW(path.wstring().c_str()); + + SetErrorMode(old_mode); + + return handle; +} + +static inline void * dl_get_sym(dl_handle * handle, const char * name) { + DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS); + SetErrorMode(old_mode | SEM_FAILCRITICALERRORS); + + void * p = (void *) GetProcAddress(handle, name); + + SetErrorMode(old_mode); + + return p; +} + +static inline const char * dl_error() { + return ""; +} + +#else + +using dl_handle = void; + +struct dl_handle_deleter { + void operator()(void * handle) { + dlclose(handle); + } +}; + +static inline dl_handle * dl_load_library(const fs::path & path) { + dl_handle * handle = dlopen(path.string().c_str(), RTLD_NOW | RTLD_LOCAL); + return handle; +} + +static inline void * dl_get_sym(dl_handle * handle, const char * name) { + return dlsym(handle, name); +} + +static inline const char * dl_error() { + const char *rslt = dlerror(); + return rslt != nullptr ? rslt : ""; +} + +#endif diff --git a/ggml/src/ggml-hexagon/libggml-htp.inf b/ggml/src/ggml-hexagon/libggml-htp.inf new file mode 100644 index 000000000..656d2d9ab --- /dev/null +++ b/ggml/src/ggml-hexagon/libggml-htp.inf @@ -0,0 +1,38 @@ +[Version] +Signature = "$WINDOWS NT$" +Class = ComputeAccelerator +ClassGuid = {F01A9D53-3FF6-48D2-9F97-C8A7004BE10C} +Provider = %GGML% +DriverVer = 01/01/2026,1.0.0.0 +CatalogFile = libggml-htp.cat +PnpLockDown = 1 + +[DestinationDirs] +Drivers_Dir = 6 + +[SourceDisksNames] +1 = %DiskId% + +[SourceDisksFiles] +libggml-htp-v68.so = 1 +libggml-htp-v69.so = 1 +libggml-htp-v73.so = 1 +libggml-htp-v75.so = 1 +libggml-htp-v81.so = 1 + +[ControlFlags] +ExcludeFromSelect = * + +[DefaultInstall.NTarm64] +CopyFiles=Drivers_Dir + +[Drivers_Dir] +libggml-htp-v68.so,,,0x10 ;COPYFLG_NO_OVERWRITE +libggml-htp-v69.so,,,0x10 ;COPYFLG_NO_OVERWRITE +libggml-htp-v73.so,,,0x10 ;COPYFLG_NO_OVERWRITE +libggml-htp-v75.so,,,0x10 ;COPYFLG_NO_OVERWRITE +libggml-htp-v81.so,,,0x10 ;COPYFLG_NO_OVERWRITE + +[Strings] +GGML = 'GGML' +DiskId = 'GGML HTP library' diff --git a/scripts/snapdragon/windows/run-bench.ps1 b/scripts/snapdragon/windows/run-bench.ps1 new file mode 100644 index 000000000..21fd063eb --- /dev/null +++ b/scripts/snapdragon/windows/run-bench.ps1 @@ -0,0 +1,40 @@ + +#!/usr/bin/env pwsh + +# Basedir on device +$basedir=".\pkg-snapdragon" + +$cli_opts=$args + +$model="Llama-3.2-3B-Instruct-Q4_0.gguf" +if ($null -ne $env:M) { + $model=$env:M +} + +$device="HTP0" +if ($null -ne $env:D) { + $device=$env:D +} + +if ($null -ne $env:V) { + $env:GGML_HEXAGON_VERBOSE=$env:V +} + +if ($null -ne $env:OPMASK) { + $env:GGML_HEXAGON_OPMASK=$env:OPMASK +} + +if ($null -ne $env:NHVX) { + $env:GGML_HEXAGON_NHVX=$env:NHVX +} + +if ($null -ne $env:NDEV) { + $env:GGML_HEXAGON_NDEV=$env:NDEV +} + +$env:ADSP_LIBRARY_PATH="$basedir\lib" + +& "$basedir\bin\llama-bench.exe" ` + --mmap 0 -m $basedir\..\..\gguf\$model ` + --poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 ` + --batch-size 128 -ngl 99 --device $device $cli_opts diff --git a/scripts/snapdragon/windows/run-cli.ps1 b/scripts/snapdragon/windows/run-cli.ps1 new file mode 100644 index 000000000..b13161aa6 --- /dev/null +++ b/scripts/snapdragon/windows/run-cli.ps1 @@ -0,0 +1,53 @@ + +#!/usr/bin/env pwsh + +# Basedir on device +$basedir=".\pkg-snapdragon" + +$cli_opts=$args + +$model="Llama-3.2-3B-Instruct-Q4_0.gguf" +if ($null -ne $env:M) { + $model=$env:M +} + +$device="HTP0" +if ($null -ne $env:D) { + $device=$env:D +} + +if ($null -ne $env:V) { + $env:GGML_HEXAGON_VERBOSE=$env:V +} + +if ($null -ne $env:E) { + $env:GGML_HEXAGON_EXPERIMENTAL=$env:E +} + +if ($null -ne $env:SCHED) { + $env:GGML_SCHED_DEBUG=$env:SCHED; $cli_opts="$cli_opts -v" +} + +if ($null -ne $env:PROF) { + $env:GGML_HEXAGON_PROFILE=$env:PROF; $env:GGML_HEXAGON_OPSYNC=1 +} + +if ($null -ne $env:OPMASK) { + $env:GGML_HEXAGON_OPMASK=$env:OPMASK +} + +if ($null -ne $env:NHVX) { + $env:GGML_HEXAGON_NHVX=$env:NHVX +} + +if ($null -ne $env:NDEV) { + $env:GGML_HEXAGON_NDEV=$env:NDEV +} + +$env:ADSP_LIBRARY_PATH="$basedir\lib" + +& "$basedir\bin\llama-completion.exe" ` + --no-mmap -no-cnv -m $basedir\..\..\gguf\$model ` + --poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 ` + --ctx-size 8192 --batch-size 128 -ctk q8_0 -ctv q8_0 -fa on ` + -ngl 99 --device $device $cli_opts diff --git a/scripts/snapdragon/windows/run-tool.ps1 b/scripts/snapdragon/windows/run-tool.ps1 new file mode 100644 index 000000000..70094af9b --- /dev/null +++ b/scripts/snapdragon/windows/run-tool.ps1 @@ -0,0 +1,56 @@ + +#!/usr/bin/env pwsh + +# Basedir on device +$basedir=".\pkg-snapdragon" + +if ($args.Count -eq 0) { + Write-Host "No arguments provided.Expected the tool and argument to run." + exit -1 +} + +$tool=$args[0] +$cli_opts=@() + +if ($args.Count -gt 1) { + $cli_opts=$args[1..($args.Count - 1)] + $remainingArgs = $args[1..($args.Count - 1)] +} + +$device="HTP0" +if ($null -ne $env:D) { + $device=$env:D +} + +if ($null -ne $env:V) { + $env:GGML_HEXAGON_VERBOSE=$env:V +} + +if ($null -ne $env:E) { + $env:GGML_HEXAGON_EXPERIMENTAL=$env:E +} + +if ($null -ne $env:SCHED) { + $env:GGML_SCHED_DEBUG=$env:SCHED; $cli_opts="$cli_opts -v" +} + +if ($null -ne $env:PROF) { + $env:GGML_HEXAGON_PROFILE=$env:PROF; $env:GGML_HEXAGON_OPSYNC=1 +} + +if ($null -ne $env:OPMASK) { + $env:GGML_HEXAGON_OPMASK=$env:OPMASK +} + +if ($null -ne $env:NHVX) { + $env:GGML_HEXAGON_NHVX=$env:NHVX +} + +if ($null -ne $env:NDEV) { + $env:GGML_HEXAGON_NDEV=$env:NDEV +} + +$env:ADSP_LIBRARY_PATH="$basedir\lib" + +& "$basedir\bin\$tool" ` + $cli_opts diff --git a/scripts/snapdragon/windows/setup-build.ps1 b/scripts/snapdragon/windows/setup-build.ps1 new file mode 100644 index 000000000..0f3244cc9 --- /dev/null +++ b/scripts/snapdragon/windows/setup-build.ps1 @@ -0,0 +1,105 @@ +# Requires Run as Administrator is NOT strictly necessary for User-scope env vars, +# but recommended for creating directories in C:\ root if permissions are restricted. + +$ErrorActionPreference = "Stop" + +# --- Configuration --- +$BaseDir = "C:\Qualcomm" + +# SDK 1: Hexagon +$HexagonUrl = "https://github.com/snapdragon-toolchain/hexagon-sdk/releases/download/v6.4.0.2/hexagon-sdk-v6.4.0.2-arm64-wos.tar.xz" +$HexagonParent = Join-Path $BaseDir "Hexagon_SDK" +$HexagonSdkVersion = "6.4.0.2" +$HexagonToolsVersion = "19.0.04" +$HexagonSdkTarget = Join-Path $HexagonParent $HexagonSdkVersion +$HexagonToolsTarget = Join-Path $HexagonSdkTarget "\tools\HEXAGON_Tools\$HexagonToolsVersion" + +# SDK 2: OpenCL +$OpenCLUrl = "https://github.com/snapdragon-toolchain/opencl-sdk/releases/download/v2.3.2/adreno-opencl-sdk-v2.3.2-arm64-wos.tar.xz" +$OpenCLParent = Join-Path $BaseDir "OpenCL_SDK" +$OpenCLVersion = "2.3.2" +$OpenCLTarget = Join-Path $OpenCLParent $OpenCLVersion + +# --- Helper Function --- +function Install-QualcommSDK { + param ( + [string]$Url, + [string]$ParentDir, + [string]$TargetDir, + [string]$Name + ) + + # 1. Create Parent Directory + if (-not (Test-Path -Path $ParentDir)) { + Write-Host "Creating directory: $ParentDir" -ForegroundColor Cyan + New-Item -Path $ParentDir -ItemType Directory -Force | Out-Null + } + + # 2. Check for Specific Version Directory + if (Test-Path -Path $TargetDir) { + Write-Host "$Name ($TargetDir) already exists. Skipping download." -ForegroundColor Green + } + else { + Write-Host "$Name not found. preparing to download..." -ForegroundColor Yellow + + # Create the target directory to extract into + New-Item -Path $TargetDir -ItemType Directory -Force | Out-Null + + # Define temporary archive path + $TempFile = Join-Path $ParentDir "temp_sdk.tar.xz" + + try { + # Download + Write-Host "Downloading from: $Url" + Invoke-WebRequest -Uri $Url -OutFile $TempFile + + # Untar + # Note: We assume Windows includes tar.exe (Win 10 build 17063+) + Write-Host "Extracting archive to $TargetDir..." + + # We use -C to extract contents INTO the target directory created above + tar -xJvf $TempFile -C $TargetDir\.. + + Write-Host "Extraction complete." -ForegroundColor Green + } + catch { + Write-Error "Failed to download or extract $Name. Error: $_" + # Cleanup target dir if failed so script tries again next time + Remove-Item -Path $TargetDir -Recurse -Force -ErrorAction SilentlyContinue + } + finally { + # Cleanup Archive + if (Test-Path $TempFile) { Remove-Item $TempFile -Force } + } + } +} + +# --- Execution --- + +# 1. Ensure Base C:\Qualcomm exists +if (-not (Test-Path $BaseDir)) { + New-Item -Path $BaseDir -ItemType Directory -Force | Out-Null +} + +# 2. Run Install Logic +Install-QualcommSDK -Url $HexagonUrl -ParentDir $HexagonParent -TargetDir $HexagonSdkTarget -Name "Hexagon SDK" +Install-QualcommSDK -Url $OpenCLUrl -ParentDir $OpenCLParent -TargetDir $OpenCLTarget -Name "OpenCL SDK" + +# --- Environment Variables --- + +Write-Host "`nSetting Environment Variables..." -ForegroundColor Cyan + +# Set OPENCL_SDK_ROOT +[System.Environment]::SetEnvironmentVariable('OPENCL_SDK_ROOT', $OpenCLTarget, [System.EnvironmentVariableTarget]::User) +$env:OPENCL_SDK_ROOT = $OpenCLTarget # Set for current session as well +Write-Host "OPENCL_SDK_ROOT set to: $OpenCLTarget" + +# Set HEXAGON_SDK_ROOT +[System.Environment]::SetEnvironmentVariable('HEXAGON_SDK_ROOT', $HexagonSdkTarget, [System.EnvironmentVariableTarget]::User) +$env:HEXAGON_SDK_ROOT = $HexagonSdkTarget # Set for current session as well +Write-Host "HEXAGON_SDK_ROOT set to: $HexagonSdkTarget" + +# Set HEXAGON_SDK_ROOT +[System.Environment]::SetEnvironmentVariable('HEXAGON_TOOLS_ROOT', $HexagonToolsTarget, [System.EnvironmentVariableTarget]::User) +$env:HEXAGON_TOOLS_ROOT = $HexagonToolsTarget # Set for current session as well +Write-Host "HEXAGON_TOOLS_ROOT set to: $HexagonToolsTarget"