runs-on: ubuntu-24.04
env:
- # Sync versions in build.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
+ # Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
OPENVINO_VERSION_MAJOR: "2026.0"
OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
--- /dev/null
+name: CI (self-hosted)
+
+on:
+  workflow_dispatch: # allows manual triggering
+  push:
+    branches:
+      - master
+    paths: [
+      # must reference THIS workflow file (was build.yml — copy-paste
+      # error; pushes editing build-self-hosted.yml would not re-trigger it)
+      '.github/workflows/build-self-hosted.yml',
+      '**/CMakeLists.txt',
+      # '*.cmake' (not '.cmake'): match CMake module files, not a file
+      # literally named ".cmake"
+      '**/*.cmake',
+      '**/*.h',
+      '**/*.hpp',
+      '**/*.c',
+      '**/*.cpp',
+      '**/*.cu',
+      '**/*.cuh',
+      '**/*.swift',
+      '**/*.m',
+      '**/*.metal',
+      '**/*.comp',
+      '**/*.glsl',
+      '**/*.wgsl'
+    ]
+
+  pull_request:
+    types: [opened, synchronize, reopened]
+    paths: [
+      '.github/workflows/build-self-hosted.yml',
+      '**/CMakeLists.txt',
+      '**/*.cmake',
+      '**/*.h',
+      '**/*.hpp',
+      '**/*.c',
+      '**/*.cpp',
+      '**/*.cu',
+      '**/*.cuh',
+      '**/*.swift',
+      '**/*.m',
+      '**/*.metal',
+      '**/*.comp',
+      '**/*.glsl',
+      '**/*.wgsl'
+    ]
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+ cancel-in-progress: true
+
+env:
+ GGML_NLOOP: 3
+ GGML_N_THREADS: 1
+ LLAMA_LOG_COLORS: 1
+ LLAMA_LOG_PREFIX: 1
+ LLAMA_LOG_TIMESTAMPS: 1
+
+jobs:
+ ggml-ci-nvidia-cuda:
+ runs-on: [self-hosted, Linux, NVIDIA]
+
+ steps:
+ - name: Clone
+ id: checkout
+ uses: actions/checkout@v6
+
+ - name: Test
+ id: ggml-ci
+ run: |
+ nvidia-smi
+ GG_BUILD_CUDA=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+
+ ggml-ci-nvidia-vulkan-cm:
+ runs-on: [self-hosted, Linux, NVIDIA]
+
+ steps:
+ - name: Clone
+ id: checkout
+ uses: actions/checkout@v6
+
+ - name: Test
+ id: ggml-ci
+ run: |
+ vulkaninfo --summary
+ GG_BUILD_VULKAN=1 GGML_VK_DISABLE_COOPMAT2=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+
+ ggml-ci-nvidia-vulkan-cm2:
+ runs-on: [self-hosted, Linux, NVIDIA, COOPMAT2]
+
+ steps:
+ - name: Clone
+ id: checkout
+ uses: actions/checkout@v6
+
+ - name: Test
+ id: ggml-ci
+ run: |
+ vulkaninfo --summary
+ GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+
+ ggml-ci-cpu-amx:
+ runs-on: [self-hosted, Linux, CPU, AMX]
+
+ steps:
+ - name: Clone
+ id: checkout
+ uses: actions/checkout@v6
+
+ - name: Test
+ id: ggml-ci
+ run: |
+ bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+
+ # ggml-ci-amd-vulkan:
+ # runs-on: [self-hosted, Linux, AMD]
+
+ # steps:
+ # - name: Clone
+ # id: checkout
+ # uses: actions/checkout@v6
+
+ # - name: Test
+ # id: ggml-ci
+ # run: |
+ # vulkaninfo --summary
+ # GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+
+ # ggml-ci-amd-rocm:
+ # runs-on: [self-hosted, Linux, AMD]
+
+ # steps:
+ # - name: Clone
+ # id: checkout
+ # uses: actions/checkout@v6
+
+ # - name: Test
+ # id: ggml-ci
+ # run: |
+ # amd-smi static
+ # GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+
+ ggml-ci-mac-metal:
+ runs-on: [self-hosted, macOS, ARM64]
+
+ steps:
+ - name: Clone
+ id: checkout
+ uses: actions/checkout@v6
+
+ - name: Test
+ id: ggml-ci
+ run: |
+ GG_BUILD_METAL=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
+
+ ggml-ci-mac-webgpu:
+ runs-on: [self-hosted, macOS, ARM64]
+
+ steps:
+ - name: Clone
+ id: checkout
+ uses: actions/checkout@v6
+
+ - name: Dawn Dependency
+ id: dawn-depends
+ run: |
+ DAWN_VERSION="v2.0.0"
+ DAWN_OWNER="reeselevine"
+ DAWN_REPO="dawn"
+ DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release"
+ echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
+ curl -L -o artifact.zip \
+ "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
+ mkdir dawn
+ unzip artifact.zip
+ tar -xvf ${DAWN_ASSET_NAME}.tar.gz -C dawn --strip-components=1
+
+ - name: Test
+ id: ggml-ci
+ run: |
+ GG_BUILD_WEBGPU=1 GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
+ bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
+
+ ggml-ci-mac-vulkan:
+ runs-on: [self-hosted, macOS, ARM64]
+
+ steps:
+ - name: Clone
+ id: checkout
+ uses: actions/checkout@v6
+
+ - name: Test
+ id: ggml-ci
+ run: |
+ vulkaninfo --summary
+ GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
+
+ ggml-ci-linux-intel-vulkan:
+ runs-on: [self-hosted, Linux, Intel]
+
+ steps:
+ - name: Clone
+ id: checkout
+ uses: actions/checkout@v6
+ with:
+ persist-credentials: false
+
+ - name: Test
+ id: ggml-ci
+ run: |
+ vulkaninfo --summary
+ GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
+
+ ggml-ci-intel-openvino-gpu-low-perf:
+ runs-on: [self-hosted, Linux, Intel, OpenVINO]
+
+ env:
+ # Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
+ OPENVINO_VERSION_MAJOR: "2026.0"
+ OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
+
+ steps:
+ - name: Clone
+ id: checkout
+ uses: actions/checkout@v6
+
+ - name: Use OpenVINO Toolkit Cache
+ uses: actions/cache@v5
+ id: cache-openvino
+ with:
+ path: ./openvino_toolkit
+ key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
+
+ - name: Setup OpenVINO Toolkit
+ if: steps.cache-openvino.outputs.cache-hit != 'true'
+ uses: ./.github/actions/linux-setup-openvino
+ with:
+ path: ./openvino_toolkit
+ version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
+ version_full: ${{ env.OPENVINO_VERSION_FULL }}
+
+ - name: Install OpenVINO dependencies
+ run: |
+ cd ./openvino_toolkit
+ chmod +x ./install_dependencies/install_openvino_dependencies.sh
+ echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh
+
+ - name: Test
+ id: ggml-ci
+ run: |
+ source ./openvino_toolkit/setupvars.sh
+ GG_BUILD_OPENVINO=1 GGML_OPENVINO_DEVICE=GPU GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
runs-on: ${{ fromJSON(matrix.runner) }}
env:
- # Sync versions in build.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
+ # Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
OPENVINO_VERSION_MAJOR: "2026.0"
OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
run: |
LLAMA_ARG_THREADS=$(nproc) GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
- ggml-ci-x64-nvidia-cuda:
- runs-on: [self-hosted, Linux, X64, NVIDIA]
-
- steps:
- - name: Clone
- id: checkout
- uses: actions/checkout@v6
-
- - name: Test
- id: ggml-ci
- run: |
- nvidia-smi
- GG_BUILD_CUDA=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
-
- ggml-ci-x64-nvidia-vulkan-cm:
- runs-on: [self-hosted, Linux, X64, NVIDIA]
-
- steps:
- - name: Clone
- id: checkout
- uses: actions/checkout@v6
-
- - name: Test
- id: ggml-ci
- run: |
- vulkaninfo --summary
- GG_BUILD_VULKAN=1 GGML_VK_DISABLE_COOPMAT2=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
-
- ggml-ci-x64-nvidia-vulkan-cm2:
- runs-on: [self-hosted, Linux, X64, NVIDIA, COOPMAT2]
-
- steps:
- - name: Clone
- id: checkout
- uses: actions/checkout@v6
-
- - name: Test
- id: ggml-ci
- run: |
- vulkaninfo --summary
- GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
-
- ggml-ci-x64-cpu-amx:
- runs-on: [self-hosted, Linux, X64, CPU, AMX]
-
- steps:
- - name: Clone
- id: checkout
- uses: actions/checkout@v6
-
- - name: Test
- id: ggml-ci
- run: |
- bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
-
- # ggml-ci-x64-amd-vulkan:
- # runs-on: [self-hosted, Linux, X64, AMD]
-
- # steps:
- # - name: Clone
- # id: checkout
- # uses: actions/checkout@v6
-
- # - name: Test
- # id: ggml-ci
- # run: |
- # vulkaninfo --summary
- # GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
-
- # ggml-ci-x64-amd-rocm:
- # runs-on: [self-hosted, Linux, X64, AMD]
-
- # steps:
- # - name: Clone
- # id: checkout
- # uses: actions/checkout@v6
-
- # - name: Test
- # id: ggml-ci
- # run: |
- # amd-smi static
- # GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
-
- ggml-ci-mac-metal:
- runs-on: [self-hosted, macOS, ARM64]
-
- steps:
- - name: Clone
- id: checkout
- uses: actions/checkout@v6
-
- - name: Test
- id: ggml-ci
- run: |
- GG_BUILD_METAL=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
-
- ggml-ci-mac-webgpu:
- runs-on: [self-hosted, macOS, ARM64]
-
- steps:
- - name: Clone
- id: checkout
- uses: actions/checkout@v6
-
- - name: Dawn Dependency
- id: dawn-depends
- run: |
- DAWN_VERSION="v2.0.0"
- DAWN_OWNER="reeselevine"
- DAWN_REPO="dawn"
- DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release"
- echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
- curl -L -o artifact.zip \
- "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
- mkdir dawn
- unzip artifact.zip
- tar -xvf ${DAWN_ASSET_NAME}.tar.gz -C dawn --strip-components=1
-
- - name: Test
- id: ggml-ci
- run: |
- GG_BUILD_WEBGPU=1 GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
- bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
-
- ggml-ci-mac-vulkan:
- runs-on: [self-hosted, macOS, ARM64]
-
- steps:
- - name: Clone
- id: checkout
- uses: actions/checkout@v6
-
- - name: Test
- id: ggml-ci
- run: |
- vulkaninfo --summary
- GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
-
- ggml-ci-x64-linux-intel-vulkan:
- runs-on: [self-hosted, Linux, X64, Intel]
-
- steps:
- - name: Clone
- id: checkout
- uses: actions/checkout@v6
- with:
- persist-credentials: false
-
- - name: Test
- id: ggml-ci
- run: |
- vulkaninfo --summary
- GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
-
ggml-ci-arm64-cpu-kleidiai:
runs-on: ubuntu-22.04-arm
run: |
GG_BUILD_KLEIDIAI=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
- ggml-ci-x64-intel-openvino-gpu-low-perf:
- runs-on: [self-hosted, Linux, X64, Intel, OpenVINO]
-
- env:
- # Sync versions in build.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
- OPENVINO_VERSION_MAJOR: "2026.0"
- OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
-
- steps:
- - name: Clone
- id: checkout
- uses: actions/checkout@v6
-
- - name: Use OpenVINO Toolkit Cache
- uses: actions/cache@v5
- id: cache-openvino
- with:
- path: ./openvino_toolkit
- key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
-
- - name: Setup OpenVINO Toolkit
- if: steps.cache-openvino.outputs.cache-hit != 'true'
- uses: ./.github/actions/linux-setup-openvino
- with:
- path: ./openvino_toolkit
- version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
- version_full: ${{ env.OPENVINO_VERSION_FULL }}
-
- - name: Install OpenVINO dependencies
- run: |
- cd ./openvino_toolkit
- chmod +x ./install_dependencies/install_openvino_dependencies.sh
- echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh
-
- - name: Test
- id: ggml-ci
- run: |
- source ./openvino_toolkit/setupvars.sh
- GG_BUILD_OPENVINO=1 GGML_OPENVINO_DEVICE=GPU GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
-
ubuntu-cpu-cmake-riscv64-native:
runs-on: RISCV64
openvino_version: ${{ steps.openvino_version.outputs.value }}
env:
- # Sync versions in build.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
+ # Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
OPENVINO_VERSION_MAJOR: "2026.0"
OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
+++ /dev/null
-name: Server-Metal
-
-on:
- workflow_dispatch: # allows manual triggering
- inputs:
- sha:
- description: 'Commit SHA1 to build'
- required: false
- type: string
- slow_tests:
- description: 'Run slow tests'
- required: true
- type: boolean
- push:
- branches:
- - master
- paths: ['.github/workflows/server-metal.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'tools/server/**.*']
-
-env:
- LLAMA_LOG_COLORS: 1
- LLAMA_LOG_PREFIX: 1
- LLAMA_LOG_TIMESTAMPS: 1
- LLAMA_LOG_VERBOSITY: 10
-
-concurrency:
- group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
- cancel-in-progress: true
-
-jobs:
- server-metal:
- runs-on: [self-hosted, macOS, ARM64]
-
- name: server-metal (${{ matrix.wf_name }})
- strategy:
- matrix:
- build_type: [Release]
- wf_name: ["GPUx1"]
- include:
- - build_type: Release
- extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
- wf_name: "GPUx1, backend-sampling"
- - build_type: Release
- extra_args: "GGML_METAL_DEVICES=2"
- wf_name: "GPUx2"
- - build_type: Release
- extra_args: "GGML_METAL_DEVICES=2 LLAMA_ARG_BACKEND_SAMPLING=1"
- wf_name: "GPUx2, backend-sampling"
- fail-fast: false
-
- steps:
- - name: Clone
- id: checkout
- uses: actions/checkout@v6
- with:
- fetch-depth: 0
- ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
-
- - name: Build
- id: cmake_build
- run: |
- cmake -B build -DGGML_SCHED_NO_REALLOC=ON
- cmake --build build --config ${{ matrix.build_type }} -j $(sysctl -n hw.logicalcpu) --target llama-server
-
- - name: Tests
- id: server_integration_tests
- if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
- run: |
- cd tools/server/tests
- python3 -m venv venv
- source venv/bin/activate
- pip install -r requirements.txt
- export ${{ matrix.extra_args }}
- pytest -v -x -m "not slow"
--- /dev/null
+name: Server (self-hosted)
+
+on:
+ workflow_dispatch: # allows manual triggering
+ inputs:
+ sha:
+ description: 'Commit SHA1 to build'
+ required: false
+ type: string
+ slow_tests:
+ description: 'Run slow tests'
+ required: true
+ type: boolean
+ push:
+ branches:
+ - master
+ paths: ['.github/workflows/server-self-hosted.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'tools/server/**.*']
+
+env:
+ LLAMA_LOG_COLORS: 1
+ LLAMA_LOG_PREFIX: 1
+ LLAMA_LOG_TIMESTAMPS: 1
+ LLAMA_LOG_VERBOSITY: 10
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
+ cancel-in-progress: true
+
+jobs:
+ server-metal:
+ runs-on: [self-hosted, llama-server, macOS, ARM64]
+
+ name: server-metal (${{ matrix.wf_name }})
+ strategy:
+ matrix:
+ build_type: [Release]
+ wf_name: ["GPUx1"]
+ include:
+ - build_type: Release
+ extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
+ wf_name: "GPUx1, backend-sampling"
+ - build_type: Release
+ extra_args: "GGML_METAL_DEVICES=2"
+ wf_name: "GPUx2"
+ - build_type: Release
+ extra_args: "GGML_METAL_DEVICES=2 LLAMA_ARG_BACKEND_SAMPLING=1"
+ wf_name: "GPUx2, backend-sampling"
+ fail-fast: false
+
+ steps:
+ - name: Clone
+ id: checkout
+ uses: actions/checkout@v6
+ with:
+ fetch-depth: 0
+ ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
+
+ - name: Build
+ id: cmake_build
+ run: |
+ cmake -B build -DGGML_SCHED_NO_REALLOC=ON
+ cmake --build build --config ${{ matrix.build_type }} -j $(sysctl -n hw.logicalcpu) --target llama-server
+
+ - name: Tests
+ id: server_integration_tests
+ if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
+ run: |
+ cd tools/server/tests
+ python3 -m venv venv
+ source venv/bin/activate
+ pip install -r requirements.txt
+ export ${{ matrix.extra_args }}
+ pytest -v -x -m "not slow"
+
+  server-cuda:
+    runs-on: [self-hosted, llama-server, Linux, NVIDIA]
+
+    name: server-cuda (${{ matrix.wf_name }})
+    strategy:
+      matrix:
+        build_type: [Release]
+        wf_name: ["GPUx1"]
+        include:
+          - build_type: Release
+            extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
+            wf_name: "GPUx1, backend-sampling"
+      fail-fast: false
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+        with:
+          fetch-depth: 0
+          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
+
+      - name: Build
+        id: cmake_build
+        run: |
+          # enable the CUDA backend — without -DGGML_CUDA=ON this job would
+          # build and test a CPU-only server on the NVIDIA runner
+          cmake -B build -DGGML_SCHED_NO_REALLOC=ON -DGGML_CUDA=ON
+          # use nproc: this is a Linux runner; 'sysctl -n hw.logicalcpu'
+          # (copied from the macOS job) expands to nothing on Linux and
+          # breaks the -j argument
+          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
+
+      - name: Tests
+        id: server_integration_tests
+        if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
+        run: |
+          cd tools/server/tests
+          python3 -m venv venv
+          source venv/bin/activate
+          pip install -r requirements.txt
+          export ${{ matrix.extra_args }}
+          pytest -v -x -m "not slow"