From: Georgi Gerganov
Date: Thu, 15 Aug 2024 07:11:11 +0000 (+0300)
Subject: ci : disable bench workflow (#9010)
X-Git-Tag: upstream/0.0.4488~901
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=d5492f0525fa533817a67e93a4bde9d71d81cf58;p=pkg%2Fggml%2Fsources%2Fllama.cpp

ci : disable bench workflow (#9010)
---

diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
deleted file mode 100644
index 56d22bc0..00000000
--- a/.github/workflows/bench.yml
+++ /dev/null
@@ -1,312 +0,0 @@
-# Benchmark
-name: Benchmark
-
-on:
-  workflow_dispatch:
-    inputs:
-      gpu-series:
-        description: 'Azure GPU series to run with'
-        required: true
-        type: choice
-        options:
-          - Standard_NC4as_T4_v3
-          - Standard_NC24ads_A100_v4
-          - Standard_NC80adis_H100_v5
-      sha:
-        description: 'Commit SHA1 to build'
-        required: false
-        type: string
-      duration:
-        description: 'Duration of the bench'
-        type: string
-        default: 10m
-
-  push:
-    branches:
-      - master
-    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
-  pull_request_target:
-    types: [opened, synchronize, reopened]
-    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
-  schedule:
-    - cron: '04 2 * * *'
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}-${{ github.event.inputs.sha }}
-  cancel-in-progress: true
-
-jobs:
-  bench-server-baseline:
-    runs-on: Standard_NC4as_T4_v3
-    env:
-      RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME Do not find a way to not duplicate it
-      N_USERS: 8
-      DURATION: 10m
-
-    strategy:
-      matrix:
-        model: [phi-2]
-        ftype: [q4_0, q8_0, f16]
-        include:
-          - model: phi-2
-            ftype: q4_0
-            pr_comment_enabled: "true"
-
-    if: |
-      inputs.gpu-series == 'Standard_NC4as_T4_v3'
-      || (
-        github.event_name == 'schedule'
-        && github.ref_name == 'master'
-        && github.repository_owner == 'ggerganov'
-      )
-      || github.event_name == 'pull_request_target'
-      || (
-        github.event_name == 'push'
-        && github.event.ref == 'refs/heads/master'
-        && github.repository_owner == 'ggerganov'
-      )
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
-
-      - name: Install python env
-        id: pipenv
-        run: |
-          cd examples/server/bench
-          python3 -m venv venv
-          source venv/bin/activate
-          pip install -r requirements.txt
-
-      - name: Prometheus
-        id: install_prometheus
-        run: |
-          wget --quiet https://github.com/prometheus/prometheus/releases/download/v2.51.0/prometheus-2.51.0.linux-amd64.tar.gz
-          tar xzf prometheus*.tar.gz --strip-components=1
-          ./prometheus --config.file=examples/server/bench/prometheus.yml &
-          while ! nc -z localhost 9090; do
-            sleep 0.1
-          done
-
-      - name: Set up Go
-        uses: actions/setup-go@v5
-        with:
-          go-version: '1.21'
-
-      - name: Install k6 and xk6-sse
-        id: k6_installation
-        run: |
-          cd examples/server/bench
-          go install go.k6.io/xk6/cmd/xk6@latest
-          xk6 build master \
-            --with github.com/phymbert/xk6-sse
-
-      - name: Build
-        id: cmake_build
-        run: |
-          set -eux
-          cmake -B build \
-            -DGGML_NATIVE=OFF \
-            -DLLAMA_BUILD_SERVER=ON \
-            -DLLAMA_CURL=ON \
-            -DLLAMA_CUBLAS=ON \
-            -DCUDAToolkit_ROOT=/usr/local/cuda \
-            -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
-            -DCMAKE_CUDA_ARCHITECTURES=75 \
-            -DLLAMA_FATAL_WARNINGS=OFF \
-            -DLLAMA_ALL_WARNINGS=OFF \
-            -DCMAKE_BUILD_TYPE=Release;
-          cmake --build build --config Release -j $(nproc) --target llama-server
-
-      - name: Download the dataset
-        id: download_dataset
-        run: |
-          cd examples/server/bench
-          wget --quiet https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-
-      - name: Server bench
-        id: server_bench
-        env:
-          HEAD_REF: ${{ github.head_ref || github.ref_name }}
-        run: |
-          set -eux
-
-          cd examples/server/bench
-          source venv/bin/activate
-          python bench.py \
-            --runner-label ${{ env.RUNNER_LABEL }} \
-            --name ${{ github.job }} \
-            --branch $HEAD_REF \
-            --commit ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }} \
-            --scenario script.js \
-            --duration ${{ github.event.inputs.duration || env.DURATION }} \
-            --hf-repo ggml-org/models \
-            --hf-file ${{ matrix.model }}/ggml-model-${{ matrix.ftype }}.gguf \
-            --model-path-prefix /models \
-            --parallel ${{ env.N_USERS }} \
-            -ngl 33 \
-            --batch-size 2048 \
-            --ubatch-size 256 \
-            --ctx-size 16384 \
-            --n-prompts 1000 \
-            --max-prompt-tokens 1024 \
-            --max-tokens 2048
-
-          cat results.github.env >> $GITHUB_ENV
-
-          # Remove dataset as we do not want it in the artefact
-          rm ShareGPT_V3_unfiltered_cleaned_split.json
-
-      - uses: actions/upload-artifact@v4
-        with:
-          name: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
-          compression-level: 9
-          path: |
-            examples/server/bench/*.jpg
-            examples/server/bench/*.json
-            examples/server/bench/*.log
-
-      - name: Commit status
-        uses: Sibz/github-status-action@v1
-        with:
-          authToken: ${{secrets.GITHUB_TOKEN}}
-          sha: ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }}
-          context: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
-          description: |
-            ${{ env.BENCH_RESULTS }}
-          state: 'success'
-
-      - name: Upload benchmark images
-        uses: devicons/public-upload-to-imgur@v2.2.2
-        continue-on-error: true # Important as it looks unstable: 503
-        id: imgur_step
-        with:
-          client_id: ${{secrets.IMGUR_CLIENT_ID}}
-          path: |
-            examples/server/bench/prompt_tokens_seconds.jpg
-            examples/server/bench/predicted_tokens_seconds.jpg
-            examples/server/bench/kv_cache_usage_ratio.jpg
-            examples/server/bench/requests_processing.jpg
-
-      - name: Extract mermaid
-        id: set_mermaid
-        run: |
-          set -eux
-
-          cd examples/server/bench
-          PROMPT_TOKENS_SECONDS=$(cat prompt_tokens_seconds.mermaid)
-          echo "PROMPT_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
-          echo "$PROMPT_TOKENS_SECONDS" >> $GITHUB_ENV
-          echo "EOF" >> $GITHUB_ENV
-
-          PREDICTED_TOKENS_SECONDS=$(cat predicted_tokens_seconds.mermaid)
-          echo "PREDICTED_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
-          echo "$PREDICTED_TOKENS_SECONDS" >> $GITHUB_ENV
-          echo "EOF" >> $GITHUB_ENV
-
-          KV_CACHE_USAGE_RATIO=$(cat kv_cache_usage_ratio.mermaid)
-          echo "KV_CACHE_USAGE_RATIO<<EOF" >> $GITHUB_ENV
-          echo "$KV_CACHE_USAGE_RATIO" >> $GITHUB_ENV
-          echo "EOF" >> $GITHUB_ENV
-
-          REQUESTS_PROCESSING=$(cat requests_processing.mermaid)
-          echo "REQUESTS_PROCESSING<<EOF" >> $GITHUB_ENV
-          echo "$REQUESTS_PROCESSING" >> $GITHUB_ENV
-          echo "EOF" >> $GITHUB_ENV
-
-      - name: Extract image url
-        id: extract_image_url
-        continue-on-error: true
-        run: |
-          set -eux
-
-          echo "IMAGE_O=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[0] }}" >> $GITHUB_ENV
-          echo "IMAGE_1=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[1] }}" >> $GITHUB_ENV
-          echo "IMAGE_2=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[2] }}" >> $GITHUB_ENV
-          echo "IMAGE_3=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[3] }}" >> $GITHUB_ENV
-
-      - name: Comment PR
-        uses: mshick/add-pr-comment@v2
-        id: comment_pr
-        if: ${{ github.event.pull_request != '' && matrix.pr_comment_enabled == 'true' }}
-        with:
-          message-id: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
-          message: |
-            <p align="center">
-
-            📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS}} iterations** 🚀
-
-            </p>
-
-            <details>
-
-            <summary>Expand details for performance related PR only</summary>
-
-            - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
-            - HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
-            - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s
-            - Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_TOKENS_SECOND_P_95_ }}tk/s
-            - ${{ env.BENCH_GRAPH_XLABEL }}
-
-
-            <p align="center">
-
-            <img width="100%" height="100%" src="${{ env.IMAGE_O }}" alt="prompt_tokens_seconds" />
-
-            <details>
-
-            <summary>More</summary>
-
-            ```mermaid
-            ${{ env.PROMPT_TOKENS_SECONDS }}
-            ```
-
-            </details>
-
-            <img width="100%" height="100%" src="${{ env.IMAGE_1 }}" alt="predicted_tokens_seconds"/>
-
-            <details>
-            <summary>More</summary>
-
-            ```mermaid
-            ${{ env.PREDICTED_TOKENS_SECONDS }}
-            ```
-
-            </details>
-
-            </p>
-
-            <details>
-
-            <summary>Details</summary>
-
-            <p align="center">
-
-            <img width="100%" height="100%" src="${{ env.IMAGE_2 }}" alt="kv_cache_usage_ratio" />
-
-            <details>
-            <summary>More</summary>
-
-            ```mermaid
-            ${{ env.KV_CACHE_USAGE_RATIO }}
-            ```
-
-            </details>
-
-            <img width="100%" height="100%" src="${{ env.IMAGE_3 }}" alt="requests_processing"/>
-
-            <details>
-            <summary>More</summary>
-
-            ```mermaid
-            ${{ env.REQUESTS_PROCESSING }}
-            ```
-
-            </details>
-
-            </p>
-            </details>
-            </details>
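The `Extract mermaid` step in the workflow above relies on GitHub Actions' multiline syntax for `$GITHUB_ENV`: a plain `KEY=value` line cannot carry newlines, so the value is wrapped between a `KEY<<DELIMITER` line and a closing `DELIMITER` line. A minimal standalone sketch of that idiom follows; the `CHART` variable and `chart.mermaid` file are illustrative, not part of the workflow:

```sh
#!/bin/sh
# Write a multi-line value into the GitHub Actions environment file.
# The runner parses KEY<<DELIMITER ... DELIMITER; the delimiter (EOF here)
# must not occur anywhere inside the value itself.
CHART=$(cat chart.mermaid)
{
    echo "CHART<<EOF"
    echo "$CHART"
    echo "EOF"
} >> "$GITHUB_ENV"
```

Later steps can then read the value as `${{ env.CHART }}`, which is how the `Comment PR` step interpolates the mermaid charts into the PR comment body.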
diff --git a/.github/workflows/bench.yml.disabled b/.github/workflows/bench.yml.disabled
new file mode 100644
index 00000000..bfdbb4ef
--- /dev/null
+++ b/.github/workflows/bench.yml.disabled
@@ -0,0 +1,315 @@
+# TODO: there have been some issues with the workflow, so disabling for now
+# https://github.com/ggerganov/llama.cpp/issues/7893
+#
+# Benchmark
+name: Benchmark
+
+on:
+  workflow_dispatch:
+    inputs:
+      gpu-series:
+        description: 'Azure GPU series to run with'
+        required: true
+        type: choice
+        options:
+          - Standard_NC4as_T4_v3
+          - Standard_NC24ads_A100_v4
+          - Standard_NC80adis_H100_v5
+      sha:
+        description: 'Commit SHA1 to build'
+        required: false
+        type: string
+      duration:
+        description: 'Duration of the bench'
+        type: string
+        default: 10m
+
+  push:
+    branches:
+      - master
+    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
+  pull_request_target:
+    types: [opened, synchronize, reopened]
+    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
+  schedule:
+    - cron: '04 2 * * *'
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}-${{ github.event.inputs.sha }}
+  cancel-in-progress: true
+
+jobs:
+  bench-server-baseline:
+    runs-on: Standard_NC4as_T4_v3
+    env:
+      RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME Do not find a way to not duplicate it
+      N_USERS: 8
+      DURATION: 10m
+
+    strategy:
+      matrix:
+        model: [phi-2]
+        ftype: [q4_0, q8_0, f16]
+        include:
+          - model: phi-2
+            ftype: q4_0
+            pr_comment_enabled: "true"
+
+    if: |
+      inputs.gpu-series == 'Standard_NC4as_T4_v3'
+      || (
+        github.event_name == 'schedule'
+        && github.ref_name == 'master'
+        && github.repository_owner == 'ggerganov'
+      )
+      || github.event_name == 'pull_request_target'
+      || (
+        github.event_name == 'push'
+        && github.event.ref == 'refs/heads/master'
+        && github.repository_owner == 'ggerganov'
+      )
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
+
+      - name: Install python env
+        id: pipenv
+        run: |
+          cd examples/server/bench
+          python3 -m venv venv
+          source venv/bin/activate
+          pip install -r requirements.txt
+
+      - name: Prometheus
+        id: install_prometheus
+        run: |
+          wget --quiet https://github.com/prometheus/prometheus/releases/download/v2.51.0/prometheus-2.51.0.linux-amd64.tar.gz
+          tar xzf prometheus*.tar.gz --strip-components=1
+          ./prometheus --config.file=examples/server/bench/prometheus.yml &
+          while ! nc -z localhost 9090; do
+            sleep 0.1
+          done
+
+      - name: Set up Go
+        uses: actions/setup-go@v5
+        with:
+          go-version: '1.21'
+
+      - name: Install k6 and xk6-sse
+        id: k6_installation
+        run: |
+          cd examples/server/bench
+          go install go.k6.io/xk6/cmd/xk6@latest
+          xk6 build master \
+            --with github.com/phymbert/xk6-sse
+
+      - name: Build
+        id: cmake_build
+        run: |
+          set -eux
+          cmake -B build \
+            -DGGML_NATIVE=OFF \
+            -DLLAMA_BUILD_SERVER=ON \
+            -DLLAMA_CURL=ON \
+            -DLLAMA_CUBLAS=ON \
+            -DCUDAToolkit_ROOT=/usr/local/cuda \
+            -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
+            -DCMAKE_CUDA_ARCHITECTURES=75 \
+            -DLLAMA_FATAL_WARNINGS=OFF \
+            -DLLAMA_ALL_WARNINGS=OFF \
+            -DCMAKE_BUILD_TYPE=Release;
+          cmake --build build --config Release -j $(nproc) --target llama-server
+
+      - name: Download the dataset
+        id: download_dataset
+        run: |
+          cd examples/server/bench
+          wget --quiet https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+
+      - name: Server bench
+        id: server_bench
+        env:
+          HEAD_REF: ${{ github.head_ref || github.ref_name }}
+        run: |
+          set -eux
+
+          cd examples/server/bench
+          source venv/bin/activate
+          python bench.py \
+            --runner-label ${{ env.RUNNER_LABEL }} \
+            --name ${{ github.job }} \
+            --branch $HEAD_REF \
+            --commit ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }} \
+            --scenario script.js \
+            --duration ${{ github.event.inputs.duration || env.DURATION }} \
+            --hf-repo ggml-org/models \
+            --hf-file ${{ matrix.model }}/ggml-model-${{ matrix.ftype }}.gguf \
+            --model-path-prefix /models \
+            --parallel ${{ env.N_USERS }} \
+            -ngl 33 \
+            --batch-size 2048 \
+            --ubatch-size 256 \
+            --ctx-size 16384 \
+            --n-prompts 1000 \
+            --max-prompt-tokens 1024 \
+            --max-tokens 2048
+
+          cat results.github.env >> $GITHUB_ENV
+
+          # Remove dataset as we do not want it in the artefact
+          rm ShareGPT_V3_unfiltered_cleaned_split.json
+
+      - uses: actions/upload-artifact@v4
+        with:
+          name: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
+          compression-level: 9
+          path: |
+            examples/server/bench/*.jpg
+            examples/server/bench/*.json
+            examples/server/bench/*.log
+
+      - name: Commit status
+        uses: Sibz/github-status-action@v1
+        with:
+          authToken: ${{secrets.GITHUB_TOKEN}}
+          sha: ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }}
+          context: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
+          description: |
+            ${{ env.BENCH_RESULTS }}
+          state: 'success'
+
+      - name: Upload benchmark images
+        uses: devicons/public-upload-to-imgur@v2.2.2
+        continue-on-error: true # Important as it looks unstable: 503
+        id: imgur_step
+        with:
+          client_id: ${{secrets.IMGUR_CLIENT_ID}}
+          path: |
+            examples/server/bench/prompt_tokens_seconds.jpg
+            examples/server/bench/predicted_tokens_seconds.jpg
+            examples/server/bench/kv_cache_usage_ratio.jpg
+            examples/server/bench/requests_processing.jpg
+
+      - name: Extract mermaid
+        id: set_mermaid
+        run: |
+          set -eux
+
+          cd examples/server/bench
+          PROMPT_TOKENS_SECONDS=$(cat prompt_tokens_seconds.mermaid)
+          echo "PROMPT_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
+          echo "$PROMPT_TOKENS_SECONDS" >> $GITHUB_ENV
+          echo "EOF" >> $GITHUB_ENV
+
+          PREDICTED_TOKENS_SECONDS=$(cat predicted_tokens_seconds.mermaid)
+          echo "PREDICTED_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
+          echo "$PREDICTED_TOKENS_SECONDS" >> $GITHUB_ENV
+          echo "EOF" >> $GITHUB_ENV
+
+          KV_CACHE_USAGE_RATIO=$(cat kv_cache_usage_ratio.mermaid)
+          echo "KV_CACHE_USAGE_RATIO<<EOF" >> $GITHUB_ENV
+          echo "$KV_CACHE_USAGE_RATIO" >> $GITHUB_ENV
+          echo "EOF" >> $GITHUB_ENV
+
+          REQUESTS_PROCESSING=$(cat requests_processing.mermaid)
+          echo "REQUESTS_PROCESSING<<EOF" >> $GITHUB_ENV
+          echo "$REQUESTS_PROCESSING" >> $GITHUB_ENV
+          echo "EOF" >> $GITHUB_ENV
+
+      - name: Extract image url
+        id: extract_image_url
+        continue-on-error: true
+        run: |
+          set -eux
+
+          echo "IMAGE_O=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[0] }}" >> $GITHUB_ENV
+          echo "IMAGE_1=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[1] }}" >> $GITHUB_ENV
+          echo "IMAGE_2=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[2] }}" >> $GITHUB_ENV
+          echo "IMAGE_3=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[3] }}" >> $GITHUB_ENV
+
+      - name: Comment PR
+        uses: mshick/add-pr-comment@v2
+        id: comment_pr
+        if: ${{ github.event.pull_request != '' && matrix.pr_comment_enabled == 'true' }}
+        with:
+          message-id: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
+          message: |
+            <p align="center">
+
+            📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS}} iterations** 🚀
+
+            </p>
+
+            <details>
+
+            <summary>Expand details for performance related PR only</summary>
+
+            - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
+            - HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
+            - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s
+            - Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_TOKENS_SECOND_P_95_ }}tk/s
+            - ${{ env.BENCH_GRAPH_XLABEL }}
+
+
+            <p align="center">
+
+            <img width="100%" height="100%" src="${{ env.IMAGE_O }}" alt="prompt_tokens_seconds" />
+
+            <details>
+
+            <summary>More</summary>
+
+            ```mermaid
+            ${{ env.PROMPT_TOKENS_SECONDS }}
+            ```
+
+            </details>
+
+            <img width="100%" height="100%" src="${{ env.IMAGE_1 }}" alt="predicted_tokens_seconds"/>
+
+            <details>
+            <summary>More</summary>
+
+            ```mermaid
+            ${{ env.PREDICTED_TOKENS_SECONDS }}
+            ```
+
+            </details>
+
+            </p>
+
+            <details>
+
+            <summary>Details</summary>
+
+            <p align="center">
+
+            <img width="100%" height="100%" src="${{ env.IMAGE_2 }}" alt="kv_cache_usage_ratio" />
+
+            <details>
+            <summary>More</summary>
+
+            ```mermaid
+            ${{ env.KV_CACHE_USAGE_RATIO }}
+            ```
+
+            </details>
+
+            <img width="100%" height="100%" src="${{ env.IMAGE_3 }}" alt="requests_processing"/>
+
+            <details>
+            <summary>More</summary>
+
+            ```mermaid
+            ${{ env.REQUESTS_PROCESSING }}
+            ```
+
+            </details>
+
+            </p>
+            </details>
+            </details>
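Note that the commit disables the benchmark by renaming the workflow file rather than deleting it: GitHub Actions only schedules `*.yml`/`*.yaml` files under `.github/workflows/`, so the `.disabled` suffix takes it out of rotation while keeping the content in the tree. Once the issues tracked in the TODO header are resolved, re-enabling it would be the inverse rename; a sketch, assuming a checkout of the repository root:

```sh
# Restore the workflow to a name the Actions runner will pick up.
git mv .github/workflows/bench.yml.disabled .github/workflows/bench.yml
git commit -m "ci : re-enable bench workflow"
```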