--- /dev/null
+name: Server-Metal
+
+on:
+  workflow_dispatch: # allows manual triggering
+    inputs:
+      sha:
+        description: 'Commit SHA1 to build'
+        required: false
+        type: string
+      slow_tests:
+        description: 'Run slow tests'
+        required: true
+        type: boolean
+  push:
+    branches:
+      - master
+    # Re-run when the workflow itself, any build file, or any C/C++/CUDA/
+    # Swift/ObjC source changes, or anything under tools/server/.
+    # NOTE(review): 'tools/server/**.*' only matches paths whose final
+    # segment contains a dot — confirm 'tools/server/**' was not intended.
+    paths: ['.github/workflows/server-metal.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'tools/server/**.*']
+env:
+  # Verbose, colored, prefixed and timestamped llama.cpp logging to make
+  # CI failures easier to diagnose from the raw job output.
+  LLAMA_LOG_COLORS: 1
+  LLAMA_LOG_PREFIX: 1
+  LLAMA_LOG_TIMESTAMPS: 1
+  LLAMA_LOG_VERBOSITY: 10
+
+concurrency:
+  # NOTE(review): this workflow only triggers on push/workflow_dispatch,
+  # where github.head_ref is empty; the run_id fallback then makes every
+  # group unique, so cancel-in-progress never takes effect — confirm this
+  # (never cancelling in-flight master runs) is the intended behavior.
+  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  server-metal:
+    # Self-hosted Apple Silicon runner with Metal GPU(s).
+    runs-on: [self-hosted, macOS, ARM64]
+
+    name: server-metal (${{ matrix.wf_name }})
+    strategy:
+      matrix:
+        # Base matrix is a single combination (Release, "GPUx1").
+        build_type: [Release]
+        wf_name: ["GPUx1"]
+        # Each include entry carries a wf_name that does not match the base
+        # combination, so GitHub adds it as an extra job rather than merging:
+        # 4 jobs total = {1, 2 Metal devices} x {backend sampling off, on}.
+        # extra_args is exported verbatim into the test step's environment.
+        include:
+          - build_type: Release
+            extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
+            wf_name: "GPUx1, backend-sampling"
+          - build_type: Release
+            extra_args: "GGML_METAL_DEVICES=2"
+            wf_name: "GPUx2"
+          - build_type: Release
+            extra_args: "GGML_METAL_DEVICES=2 LLAMA_ARG_BACKEND_SAMPLING=1"
+            wf_name: "GPUx2, backend-sampling"
+      # Let the remaining device/sampling variants finish even if one fails.
+      fail-fast: false
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+        with:
+          fetch-depth: 0
+          # Build the manually requested SHA when dispatched, else the pushed
+          # commit. NOTE(review): github.sha is always non-empty, so the
+          # trailing head_ref/ref_name fallbacks are dead; and with no
+          # pull_request trigger, github.event.pull_request.head.sha can
+          # never be populated here — consider trimming the chain.
+          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
+
+      - name: Build
+        id: cmake_build
+        # Build only the llama-server target, parallelized across all cores.
+        run: |
+          # Single-config generators (the default Makefiles/Ninja on macOS)
+          # ignore `--config` at build time, so the build type must be fixed
+          # at configure time — otherwise the "Release" job silently produces
+          # an unoptimized default build. `--config` is kept as well so the
+          # command also works under multi-config generators (e.g. Xcode).
+          cmake -B build -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DGGML_SCHED_NO_REALLOC=ON
+          cmake --build build --config ${{ matrix.build_type }} -j $(sysctl -n hw.logicalcpu) --target llama-server
+
+      - name: Tests
+        id: server_integration_tests
+        # NOTE(review): the matrix defines no `disabled_on_pr` key and this
+        # workflow has no pull_request trigger, so this condition is always
+        # true — confirm it is a leftover from a copied workflow.
+        if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
+        run: |
+          cd tools/server/tests
+          python3 -m venv venv
+          source venv/bin/activate
+          pip install -r requirements.txt
+          # Inject the per-variant device/sampling environment (no-op when
+          # extra_args is empty on the base matrix combination).
+          export ${{ matrix.extra_args }}
+          # Honor the `slow_tests` workflow_dispatch input, which was
+          # previously declared but never consumed: include slow-marked
+          # tests only when explicitly requested (the input is unset on
+          # push events, so pushes keep the fast "not slow" selection).
+          if [ "${{ inputs.slow_tests }}" = "true" ]; then
+            pytest -v -x
+          else
+            pytest -v -x -m "not slow"
+          fi