+++ /dev/null
-# Benchmark
-name: Benchmark
-
-on:
- workflow_dispatch:
- inputs:
- gpu-series:
- description: 'Azure GPU series to run with'
- required: true
- type: choice
- options:
- - Standard_NC4as_T4_v3
- - Standard_NC24ads_A100_v4
- - Standard_NC80adis_H100_v5
- sha:
- description: 'Commit SHA1 to build'
- required: false
- type: string
- duration:
- description: 'Duration of the bench'
- type: string
- default: 10m
-
- push:
- branches:
- - master
- paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
- pull_request_target:
- types: [opened, synchronize, reopened]
- paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
- schedule:
- - cron: '04 2 * * *'
-
-concurrency:
- group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}-${{ github.event.inputs.sha }}
- cancel-in-progress: true
-
-jobs:
- bench-server-baseline:
- runs-on: Standard_NC4as_T4_v3
- env:
- RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME Could not find a way to avoid duplicating the runner label
- N_USERS: 8
- DURATION: 10m
-
- strategy:
- matrix:
- model: [phi-2]
- ftype: [q4_0, q8_0, f16]
- include:
- - model: phi-2
- ftype: q4_0
- pr_comment_enabled: "true"
-
- if: |
- inputs.gpu-series == 'Standard_NC4as_T4_v3'
- || (
- github.event_name == 'schedule'
- && github.ref_name == 'master'
- && github.repository_owner == 'ggerganov'
- )
- || github.event_name == 'pull_request_target'
- || (
- github.event_name == 'push'
- && github.event.ref == 'refs/heads/master'
- && github.repository_owner == 'ggerganov'
- )
- steps:
- - name: Clone
- id: checkout
- uses: actions/checkout@v4
- with:
- fetch-depth: 0
- ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
-
- - name: Install python env
- id: pipenv
- run: |
- cd examples/server/bench
- python3 -m venv venv
- source venv/bin/activate
- pip install -r requirements.txt
-
- - name: Prometheus
- id: install_prometheus
- run: |
- wget --quiet https://github.com/prometheus/prometheus/releases/download/v2.51.0/prometheus-2.51.0.linux-amd64.tar.gz
- tar xzf prometheus*.tar.gz --strip-components=1
- ./prometheus --config.file=examples/server/bench/prometheus.yml &
- while ! nc -z localhost 9090; do
- sleep 0.1
- done
-
- - name: Set up Go
- uses: actions/setup-go@v5
- with:
- go-version: '1.21'
-
- - name: Install k6 and xk6-sse
- id: k6_installation
- run: |
- cd examples/server/bench
- go install go.k6.io/xk6/cmd/xk6@latest
- xk6 build master \
- --with github.com/phymbert/xk6-sse
-
- - name: Build
- id: cmake_build
- run: |
- set -eux
- cmake -B build \
- -DGGML_NATIVE=OFF \
- -DLLAMA_BUILD_SERVER=ON \
- -DLLAMA_CURL=ON \
- -DLLAMA_CUBLAS=ON \
- -DCUDAToolkit_ROOT=/usr/local/cuda \
- -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
- -DCMAKE_CUDA_ARCHITECTURES=75 \
- -DLLAMA_FATAL_WARNINGS=OFF \
- -DLLAMA_ALL_WARNINGS=OFF \
- -DCMAKE_BUILD_TYPE=Release
- cmake --build build --config Release -j $(nproc) --target llama-server
-
- - name: Download the dataset
- id: download_dataset
- run: |
- cd examples/server/bench
- wget --quiet https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-
- - name: Server bench
- id: server_bench
- env:
- HEAD_REF: ${{ github.head_ref || github.ref_name }}
- run: |
- set -eux
-
- cd examples/server/bench
- source venv/bin/activate
- python bench.py \
- --runner-label ${{ env.RUNNER_LABEL }} \
- --name ${{ github.job }} \
- --branch $HEAD_REF \
- --commit ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }} \
- --scenario script.js \
- --duration ${{ github.event.inputs.duration || env.DURATION }} \
- --hf-repo ggml-org/models \
- --hf-file ${{ matrix.model }}/ggml-model-${{ matrix.ftype }}.gguf \
- --model-path-prefix /models \
- --parallel ${{ env.N_USERS }} \
- -ngl 33 \
- --batch-size 2048 \
- --ubatch-size 256 \
- --ctx-size 16384 \
- --n-prompts 1000 \
- --max-prompt-tokens 1024 \
- --max-tokens 2048
-
- cat results.github.env >> $GITHUB_ENV
-
- # Remove the dataset as we do not want it in the artifact
- rm ShareGPT_V3_unfiltered_cleaned_split.json
-
- - uses: actions/upload-artifact@v4
- with:
- name: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
- compression-level: 9
- path: |
- examples/server/bench/*.jpg
- examples/server/bench/*.json
- examples/server/bench/*.log
-
- - name: Commit status
- uses: Sibz/github-status-action@v1
- with:
- authToken: ${{ secrets.GITHUB_TOKEN }}
- sha: ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }}
- context: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
- description: |
- ${{ env.BENCH_RESULTS }}
- state: 'success'
-
- - name: Upload benchmark images
- uses: devicons/public-upload-to-imgur@v2.2.2
- continue-on-error: true # Important: the upload service looks unstable (intermittent 503 errors)
- id: imgur_step
- with:
- client_id: ${{ secrets.IMGUR_CLIENT_ID }}
- path: |
- examples/server/bench/prompt_tokens_seconds.jpg
- examples/server/bench/predicted_tokens_seconds.jpg
- examples/server/bench/kv_cache_usage_ratio.jpg
- examples/server/bench/requests_processing.jpg
-
- - name: Extract mermaid
- id: set_mermaid
- run: |
- set -eux
-
- cd examples/server/bench
- PROMPT_TOKENS_SECONDS=$(cat prompt_tokens_seconds.mermaid)
- echo "PROMPT_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
- echo "$PROMPT_TOKENS_SECONDS" >> $GITHUB_ENV
- echo "EOF" >> $GITHUB_ENV
-
- PREDICTED_TOKENS_SECONDS=$(cat predicted_tokens_seconds.mermaid)
- echo "PREDICTED_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
- echo "$PREDICTED_TOKENS_SECONDS" >> $GITHUB_ENV
- echo "EOF" >> $GITHUB_ENV
-
- KV_CACHE_USAGE_RATIO=$(cat kv_cache_usage_ratio.mermaid)
- echo "KV_CACHE_USAGE_RATIO<<EOF" >> $GITHUB_ENV
- echo "$KV_CACHE_USAGE_RATIO" >> $GITHUB_ENV
- echo "EOF" >> $GITHUB_ENV
-
- REQUESTS_PROCESSING=$(cat requests_processing.mermaid)
- echo "REQUESTS_PROCESSING<<EOF" >> $GITHUB_ENV
- echo "$REQUESTS_PROCESSING" >> $GITHUB_ENV
- echo "EOF" >> $GITHUB_ENV
-
- - name: Extract image url
- id: extract_image_url
- continue-on-error: true
- run: |
- set -eux
-
- echo "IMAGE_O=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[0] }}" >> $GITHUB_ENV
- echo "IMAGE_1=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[1] }}" >> $GITHUB_ENV
- echo "IMAGE_2=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[2] }}" >> $GITHUB_ENV
- echo "IMAGE_3=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[3] }}" >> $GITHUB_ENV
-
- - name: Comment PR
- uses: mshick/add-pr-comment@v2
- id: comment_pr
- if: ${{ github.event.pull_request != '' && matrix.pr_comment_enabled == 'true' }}
- with:
- message-id: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
- message: |
- <p align="center">
-
- 📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS }} iterations** 🚀
-
- </p>
-
- <details>
-
- <summary>Expand details for performance-related PRs only</summary>
-
- - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
- - HTTP request: avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
- - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s
- - Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_TOKENS_SECOND_P_95_ }}tk/s
- - ${{ env.BENCH_GRAPH_XLABEL }}
-
-
- <p align="center">
-
- <img width="100%" height="100%" src="${{ env.IMAGE_O }}" alt="prompt_tokens_seconds" />
-
- <details>
-
- <summary>More</summary>
-
- ```mermaid
- ${{ env.PROMPT_TOKENS_SECONDS }}
- ```
-
- </details>
-
- <img width="100%" height="100%" src="${{ env.IMAGE_1 }}" alt="predicted_tokens_seconds"/>
-
- <details>
- <summary>More</summary>
-
- ```mermaid
- ${{ env.PREDICTED_TOKENS_SECONDS }}
- ```
-
- </details>
-
- </p>
-
- <details>
-
- <summary>Details</summary>
-
- <p align="center">
-
- <img width="100%" height="100%" src="${{ env.IMAGE_2 }}" alt="kv_cache_usage_ratio" />
-
- <details>
- <summary>More</summary>
-
- ```mermaid
- ${{ env.KV_CACHE_USAGE_RATIO }}
- ```
-
- </details>
-
- <img width="100%" height="100%" src="${{ env.IMAGE_3 }}" alt="requests_processing"/>
-
- <details>
- <summary>More</summary>
-
- ```mermaid
- ${{ env.REQUESTS_PROCESSING }}
- ```
-
- </details>
-
- </p>
- </details>
- </details>
--- /dev/null
+# TODO: there have been some issues with the workflow, so disabling for now
+# https://github.com/ggerganov/llama.cpp/issues/7893
+#
+# Benchmark
+name: Benchmark
+
+on:
+ workflow_dispatch:
+ inputs:
+ gpu-series:
+ description: 'Azure GPU series to run with'
+ required: true
+ type: choice
+ options:
+ - Standard_NC4as_T4_v3
+ - Standard_NC24ads_A100_v4
+ - Standard_NC80adis_H100_v5
+ sha:
+ description: 'Commit SHA1 to build'
+ required: false
+ type: string
+ duration:
+ description: 'Duration of the bench'
+ type: string
+ default: 10m
+
+ push:
+ branches:
+ - master
+ paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
+ pull_request_target:
+ types: [opened, synchronize, reopened]
+ paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
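+ # Nightly run: '04 2 * * *' fires at 02:04 UTC every day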
+ schedule:
+ - cron: '04 2 * * *'
+
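+# Allow only one benchmark per ref/SHA at a time; a newer run cancels any in-flight one with the same group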
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}-${{ github.event.inputs.sha }}
+ cancel-in-progress: true
+
+jobs:
+ bench-server-baseline:
+ runs-on: Standard_NC4as_T4_v3
+ env:
+ RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME Could not find a way to avoid duplicating the runner label
+ N_USERS: 8
+ DURATION: 10m
+
+ strategy:
+ matrix:
+ model: [phi-2]
+ ftype: [q4_0, q8_0, f16]
+ include:
+ - model: phi-2
+ ftype: q4_0
+ pr_comment_enabled: "true"
+
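+ # Run only for: a manual dispatch targeting the T4 series, the nightly schedule on master
+ # in the upstream repo, any pull request, or a push to master in the upstream repo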
+ if: |
+ inputs.gpu-series == 'Standard_NC4as_T4_v3'
+ || (
+ github.event_name == 'schedule'
+ && github.ref_name == 'master'
+ && github.repository_owner == 'ggerganov'
+ )
+ || github.event_name == 'pull_request_target'
+ || (
+ github.event_name == 'push'
+ && github.event.ref == 'refs/heads/master'
+ && github.repository_owner == 'ggerganov'
+ )
+ steps:
+ - name: Clone
+ id: checkout
+ uses: actions/checkout@v4
+ with:
+ fetch-depth: 0
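+ # Resolve the ref to build: manual input SHA, then the PR head SHA, then the pushed SHA
+ # (the trailing fallbacks are effectively unreachable since github.sha is always set)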
+ ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
+
+ - name: Install python env
+ id: pipenv
+ run: |
+ cd examples/server/bench
+ python3 -m venv venv
+ source venv/bin/activate
+ pip install -r requirements.txt
+
+ - name: Prometheus
+ id: install_prometheus
+ run: |
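+ # Start a local Prometheus for the duration of the bench; examples/server/bench/prometheus.yml presumably configures it to scrape the server's metrics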
+ wget --quiet https://github.com/prometheus/prometheus/releases/download/v2.51.0/prometheus-2.51.0.linux-amd64.tar.gz
+ tar xzf prometheus*.tar.gz --strip-components=1
+ ./prometheus --config.file=examples/server/bench/prometheus.yml &
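+ # Wait until Prometheus is accepting connections on its default port (9090)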
+ while ! nc -z localhost 9090; do
+ sleep 0.1
+ done
+
+ - name: Set up Go
+ uses: actions/setup-go@v5
+ with:
+ go-version: '1.21'
+
+ - name: Install k6 and xk6-sse
+ id: k6_installation
+ run: |
+ cd examples/server/bench
+ go install go.k6.io/xk6/cmd/xk6@latest
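+ # xk6 compiles a custom k6 binary from the given k6 ref (here: master) with the SSE extension bundled in, which stock k6 lacks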
+ xk6 build master \
+ --with github.com/phymbert/xk6-sse
+
+ - name: Build
+ id: cmake_build
+ run: |
+ set -eux
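+ # Note: CMAKE_CUDA_ARCHITECTURES=75 targets the T4 runner (compute capability 7.5)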
+ cmake -B build \
+ -DGGML_NATIVE=OFF \
+ -DLLAMA_BUILD_SERVER=ON \
+ -DLLAMA_CURL=ON \
+ -DLLAMA_CUBLAS=ON \
+ -DCUDAToolkit_ROOT=/usr/local/cuda \
+ -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
+ -DCMAKE_CUDA_ARCHITECTURES=75 \
+ -DLLAMA_FATAL_WARNINGS=OFF \
+ -DLLAMA_ALL_WARNINGS=OFF \
+ -DCMAKE_BUILD_TYPE=Release
+ cmake --build build --config Release -j $(nproc) --target llama-server
+
+ - name: Download the dataset
+ id: download_dataset
+ run: |
+ cd examples/server/bench
+ wget --quiet https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+
+ - name: Server bench
+ id: server_bench
+ env:
+ HEAD_REF: ${{ github.head_ref || github.ref_name }}
+ run: |
+ set -eux
+
+ cd examples/server/bench
+ source venv/bin/activate
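+ # bench.py orchestrates the run; judging by its flags, it fetches the model from the HF repo, starts llama-server and drives the k6 scenario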
+ python bench.py \
+ --runner-label ${{ env.RUNNER_LABEL }} \
+ --name ${{ github.job }} \
+ --branch $HEAD_REF \
+ --commit ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }} \
+ --scenario script.js \
+ --duration ${{ github.event.inputs.duration || env.DURATION }} \
+ --hf-repo ggml-org/models \
+ --hf-file ${{ matrix.model }}/ggml-model-${{ matrix.ftype }}.gguf \
+ --model-path-prefix /models \
+ --parallel ${{ env.N_USERS }} \
+ -ngl 33 \
+ --batch-size 2048 \
+ --ubatch-size 256 \
+ --ctx-size 16384 \
+ --n-prompts 1000 \
+ --max-prompt-tokens 1024 \
+ --max-tokens 2048
+
+ cat results.github.env >> $GITHUB_ENV
+
+ # Remove the dataset as we do not want it in the artifact
+ rm ShareGPT_V3_unfiltered_cleaned_split.json
+
+ - uses: actions/upload-artifact@v4
+ with:
+ name: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
+ compression-level: 9
+ path: |
+ examples/server/bench/*.jpg
+ examples/server/bench/*.json
+ examples/server/bench/*.log
+
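+ # Publish the bench summary as a commit status; state is hard-coded to 'success', so it is informational only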
+ - name: Commit status
+ uses: Sibz/github-status-action@v1
+ with:
+ authToken: ${{ secrets.GITHUB_TOKEN }}
+ sha: ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }}
+ context: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
+ description: |
+ ${{ env.BENCH_RESULTS }}
+ state: 'success'
+
+ - name: Upload benchmark images
+ uses: devicons/public-upload-to-imgur@v2.2.2
+ continue-on-error: true # Important: the upload service looks unstable (intermittent 503 errors)
+ id: imgur_step
+ with:
+ client_id: ${{ secrets.IMGUR_CLIENT_ID }}
+ path: |
+ examples/server/bench/prompt_tokens_seconds.jpg
+ examples/server/bench/predicted_tokens_seconds.jpg
+ examples/server/bench/kv_cache_usage_ratio.jpg
+ examples/server/bench/requests_processing.jpg
+
+ - name: Extract mermaid
+ id: set_mermaid
+ run: |
+ set -eux
+
+ cd examples/server/bench
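+ # Multiline values must be written to GITHUB_ENV using the NAME<<EOF ... EOF heredoc syntax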
+ PROMPT_TOKENS_SECONDS=$(cat prompt_tokens_seconds.mermaid)
+ echo "PROMPT_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
+ echo "$PROMPT_TOKENS_SECONDS" >> $GITHUB_ENV
+ echo "EOF" >> $GITHUB_ENV
+
+ PREDICTED_TOKENS_SECONDS=$(cat predicted_tokens_seconds.mermaid)
+ echo "PREDICTED_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
+ echo "$PREDICTED_TOKENS_SECONDS" >> $GITHUB_ENV
+ echo "EOF" >> $GITHUB_ENV
+
+ KV_CACHE_USAGE_RATIO=$(cat kv_cache_usage_ratio.mermaid)
+ echo "KV_CACHE_USAGE_RATIO<<EOF" >> $GITHUB_ENV
+ echo "$KV_CACHE_USAGE_RATIO" >> $GITHUB_ENV
+ echo "EOF" >> $GITHUB_ENV
+
+ REQUESTS_PROCESSING=$(cat requests_processing.mermaid)
+ echo "REQUESTS_PROCESSING<<EOF" >> $GITHUB_ENV
+ echo "$REQUESTS_PROCESSING" >> $GITHUB_ENV
+ echo "EOF" >> $GITHUB_ENV
+
+ - name: Extract image url
+ id: extract_image_url
+ continue-on-error: true
+ run: |
+ set -eux
+
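+ # The imgur step exposes the uploaded URLs as a JSON array; fromJSON() lets the workflow expression index into it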
+ echo "IMAGE_O=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[0] }}" >> $GITHUB_ENV
+ echo "IMAGE_1=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[1] }}" >> $GITHUB_ENV
+ echo "IMAGE_2=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[2] }}" >> $GITHUB_ENV
+ echo "IMAGE_3=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[3] }}" >> $GITHUB_ENV
+
+ - name: Comment PR
+ uses: mshick/add-pr-comment@v2
+ id: comment_pr
+ if: ${{ github.event.pull_request != '' && matrix.pr_comment_enabled == 'true' }}
+ with:
+ message-id: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
+ message: |
+ <p align="center">
+
+ 📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS }} iterations** 🚀
+
+ </p>
+
+ <details>
+
+ <summary>Expand details for performance-related PRs only</summary>
+
+ - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
+ - HTTP request: avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
+ - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s
+ - Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_TOKENS_SECOND_P_95_ }}tk/s
+ - ${{ env.BENCH_GRAPH_XLABEL }}
+
+
+ <p align="center">
+
+ <img width="100%" height="100%" src="${{ env.IMAGE_O }}" alt="prompt_tokens_seconds" />
+
+ <details>
+
+ <summary>More</summary>
+
+ ```mermaid
+ ${{ env.PROMPT_TOKENS_SECONDS }}
+ ```
+
+ </details>
+
+ <img width="100%" height="100%" src="${{ env.IMAGE_1 }}" alt="predicted_tokens_seconds"/>
+
+ <details>
+ <summary>More</summary>
+
+ ```mermaid
+ ${{ env.PREDICTED_TOKENS_SECONDS }}
+ ```
+
+ </details>
+
+ </p>
+
+ <details>
+
+ <summary>Details</summary>
+
+ <p align="center">
+
+ <img width="100%" height="100%" src="${{ env.IMAGE_2 }}" alt="kv_cache_usage_ratio" />
+
+ <details>
+ <summary>More</summary>
+
+ ```mermaid
+ ${{ env.KV_CACHE_USAGE_RATIO }}
+ ```
+
+ </details>
+
+ <img width="100%" height="100%" src="${{ env.IMAGE_3 }}" alt="requests_processing"/>
+
+ <details>
+ <summary>More</summary>
+
+ ```mermaid
+ ${{ env.REQUESTS_PROCESSING }}
+ ```
+
+ </details>
+
+ </p>
+ </details>
+ </details>