llama.cpp/.github/workflows/bench.yml

# Server benchmark workflow. Runs on manual dispatch, on pushes to master, on
# pull requests and once a day on a schedule.
name: Benchmark

on:
  # Manual run: choose the GPU runner and, optionally, a commit and a duration.
  workflow_dispatch:
    inputs:
      gpu-series:
        description: 'Azure GPU series to run with'
        type: choice
        required: true
        options:
          - Standard_NC4as_T4_v3
          - Standard_NC24ads_A100_v4
          - Standard_NC80adis_H100_v5
      sha:
        description: 'Commit SHA1 to build'
        type: string
        required: false
      duration:
        description: 'Duration of the bench'
        type: string
        default: 10m

  # Only changes to this workflow, the build files, the sources or the bench
  # scripts trigger a run. The push and pull_request filters are kept identical.
  push:
    branches:
      - master
    paths:
      - '.github/workflows/bench.yml'
      - '**/CMakeLists.txt'
      - '**/Makefile'
      - '**/*.h'
      - '**/*.hpp'
      - '**/*.c'
      - '**/*.cpp'
      - '**/*.cu'
      - '**/*.swift'
      - '**/*.m'
      - 'examples/server/bench/**.*'
  pull_request:
    types:
      - opened
      - synchronize
      - reopened
    paths:
      - '.github/workflows/bench.yml'
      - '**/CMakeLists.txt'
      - '**/Makefile'
      - '**/*.h'
      - '**/*.hpp'
      - '**/*.c'
      - '**/*.cpp'
      - '**/*.cu'
      - '**/*.swift'
      - '**/*.m'
      - 'examples/server/bench/**.*'
  # Daily run at minute 04 of hour 2 (GitHub evaluates cron schedules in UTC).
  schedule:
    - cron: '04 2 * * *'

# One active run per workflow and ref: starting a new run cancels the one that
# is still in progress for the same group.
concurrency:
  cancel-in-progress: true
  group: ${{ github.workflow }}-${{ github.ref }}

jobs:
  # Baseline server benchmark, pinned to the T4 runner.
  bench-server-baseline:
    runs-on: Standard_NC4as_T4_v3
    env:
      RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME Do not find a way to not duplicate it
      N_USERS: 8
      DURATION: 10m
    # Run when: a manual dispatch selected this runner, the schedule fired, the
    # event is a pull request, or the event is a push to master.
    # The push payload exposes the branch as `ref` at its top level (surfaced as
    # `github.ref`); there is no `github.event.push` object, so the previous
    # `github.event.push.ref` test was never true and pushes skipped the job.
    if: ${{ github.event.inputs.gpu-series == 'Standard_NC4as_T4_v3' || github.event.schedule || github.event.pull_request || (github.event_name == 'push' && github.ref == 'refs/heads/master') }}
    steps:
      # Check out the commit to benchmark with full history (fetch-depth: 0).
      # The ref falls back from the dispatch input, to the PR head commit, to the
      # commit that triggered the run.
      - id: checkout
        name: Clone
        uses: actions/checkout@v3
        with:
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
          fetch-depth: 0

      # Create the virtualenv in the bench directory and install the bench
      # requirements into it; the "Server bench" step activates it again later.
      - name: Install python env
        id: pipenv
        working-directory: examples/server/bench
        run: |
          python3 -m venv venv
          source venv/bin/activate
          pip install -r requirements.txt

      # Download a pinned Prometheus release, start it in the background with the
      # bench configuration and wait until it accepts connections on port 9090.
      - name: Prometheus
        id: install_prometheus
        run: |
          wget --quiet https://github.com/prometheus/prometheus/releases/download/v2.51.0/prometheus-2.51.0.linux-amd64.tar.gz
          tar xzf prometheus*.tar.gz --strip-components=1
          ./prometheus --config.file=examples/server/bench/prometheus.yml &

          # Bounded wait (600 probes x 0.1s, roughly one minute). If Prometheus
          # crashes on startup, fail the step with a clear message instead of
          # polling until the job-level timeout kills the run.
          for _ in $(seq 1 600); do
            if nc -z localhost 9090; then
              exit 0
            fi
            sleep 0.1
          done
          echo "Prometheus did not start listening on localhost:9090 in time" >&2
          exit 1

      # Fetch the pinned k6 release and unpack it into the bench directory, where
      # the "Server bench" step expects to find ./k6.
      - name: Install k6
        id: k6_installation
        working-directory: examples/server/bench
        run: |
          wget --quiet https://github.com/grafana/k6/releases/download/v0.49.0/k6-v0.49.0-linux-amd64.tar.gz
          tar xzf k6*.tar.gz --strip-components=1

      # Release build of the `server` target with cuBLAS and curl support.
      # CUDA architecture 75 is compute capability 7.5 (presumably chosen for the
      # T4 runner this job is pinned to; confirm before reusing on other GPUs).
      - name: Build
        id: cmake_build
        run: |
          set -eux
          mkdir build
          cd build
          cmake .. \
              -DCMAKE_BUILD_TYPE=Release \
              -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
              -DCMAKE_CUDA_ARCHITECTURES=75 \
              -DCUDAToolkit_ROOT=/usr/local/cuda \
              -DLLAMA_CUBLAS=ON \
              -DLLAMA_CURL=ON \
              -DLLAMA_BUILD_SERVER=ON \
              -DLLAMA_NATIVE=OFF \
              -DLLAMA_ALL_WARNINGS=OFF \
              -DLLAMA_FATAL_WARNINGS=OFF
          cmake --build . --config Release -j "$(nproc)" --target server

      # Download the ShareGPT dataset file into the bench directory; the
      # "Server bench" step deletes it again once the run is finished.
      - name: Download the dataset
        id: download_dataset
        working-directory: examples/server/bench
        run: |
          wget --quiet https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

      # Run bench.py against the freshly built server, export its results to the
      # job environment and drop the dataset so it is not uploaded as an artefact.
      - name: Server bench
        id: server_bench
        env:
          # Event-derived values (a PR branch name, dispatch inputs) are handed to
          # the script as environment variables instead of being expanded into
          # the script text, so shell metacharacters in them are never executed.
          BENCH_BRANCH: ${{ github.head_ref || github.ref_name }}
          BENCH_COMMIT: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }}
          BENCH_DURATION: ${{ github.event.inputs.duration || env.DURATION }}
        run: |
          set -eux

          cd examples/server/bench
          source venv/bin/activate
          BENCH_K6_BIN_PATH=./k6 python bench.py \
              --runner-label ${{ env.RUNNER_LABEL }} \
              --name ${{ github.job }} \
              --branch "$BENCH_BRANCH" \
              --commit "$BENCH_COMMIT" \
              --scenario script.js \
              --duration "$BENCH_DURATION" \
              --hf-repo ggml-org/models \
              --hf-file phi-2/ggml-model-q4_0.gguf \
              --model-path-prefix /models \
              --parallel ${{ env.N_USERS }} \
              -ngl 33 \
              --batch-size 2048 \
              --ubatch-size 256 \
              --ctx-size 16384 \
              --n-prompts 1000 \
              --max-prompt-tokens 1024 \
              --max-tokens 2048

          # Make the values written by the bench available to the later steps.
          cat results.github.env >> "$GITHUB_ENV"

          # Remove dataset as we do not want it in the artefact
          rm ShareGPT_V3_unfiltered_cleaned_split.json

      # Archive the graphs, JSON results and logs left in the bench directory,
      # using the maximum compression level.
      - uses: actions/upload-artifact@v4
        with:
          name: benchmark-results
          path: |
            examples/server/bench/*.jpg
            examples/server/bench/*.json
            examples/server/bench/*.log
          compression-level: 9

      # Attach a `bench-server-baseline` commit status to the benchmarked commit.
      # The description comes from BENCH_RESULTS in the job environment
      # (presumably written by the bench through results.github.env; confirm).
      - name: Commit status
        uses: Sibz/github-status-action@v1
        with:
          authToken: ${{secrets.GITHUB_TOKEN}}
          context: bench-server-baseline
          state: 'success'
          sha: ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }}
          description: |
            ${{ env.BENCH_RESULTS }}

      # Upload the four graphs to imgur. Their order matters: the
      # "Extract image url" step reads the resulting URLs by index 0..3.
      - name: Upload benchmark images
        id: imgur_step
        uses: devicons/public-upload-to-imgur@v2.2.2
        continue-on-error: true # Important as it looks unstable: 503
        with:
          path: |
            examples/server/bench/prompt_tokens_seconds.jpg
            examples/server/bench/predicted_tokens_seconds.jpg
            examples/server/bench/kv_cache_usage_ratio.jpg
            examples/server/bench/requests_processing.jpg
          client_id: ${{secrets.IMGUR_CLIENT_ID}}

      # Export each generated mermaid chart as a multi-line environment variable
      # named after the chart file in upper case (e.g. PROMPT_TOKENS_SECONDS).
      - name: Extract mermaid
        id: set_mermaid
        run: |
          set -eux

          cd examples/server/bench
          for metric in prompt_tokens_seconds predicted_tokens_seconds kv_cache_usage_ratio requests_processing; do
            # Read first so a missing chart file aborts before anything is written.
            chart=$(cat "${metric}.mermaid")
            {
              echo "${metric^^}<<EOF"
              echo "$chart"
              echo "EOF"
            } >> "$GITHUB_ENV"
          done

      # Export the imgur URLs, in upload order, for the PR comment template.
      # Allowed to fail: the imgur upload itself is continue-on-error, so its
      # output may be missing.
      # NOTE: the first variable is spelled IMAGE_O (capital letter O, not zero);
      # the PR comment reads it under that exact name, so rename both together.
      - name: Extract image url
        id: extract_image_url
        continue-on-error: true
        run: |
          set -eux

          {
            echo "IMAGE_O=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[0] }}"
            echo "IMAGE_1=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[1] }}"
            echo "IMAGE_2=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[2] }}"
            echo "IMAGE_3=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[3] }}"
          } >> "$GITHUB_ENV"

      # Post the benchmark summary as a pull request comment. Every value in the
      # template is read from the job environment: the IMAGE_* URLs and the
      # mermaid charts are exported by the two "Extract ..." steps, the metric
      # values presumably by the bench through results.github.env (confirm).
      - name: Comment PR
        uses: mshick/add-pr-comment@v2
        id: comment_pr
        # Only for pull request events. This relies on GitHub's loose comparison:
        # a missing pull_request payload (null) compares equal to '', while a
        # payload object does not.
        if: ${{ github.event.pull_request != '' }}
        with:
          # One id per job and runner label (presumably lets the action update its
          # earlier comment instead of adding a new one; confirm in its docs).
          message-id: bench-${{ github.job }}-${{ env.RUNNER_LABEL }}
          message: |
            📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_: **${{ env.BENCH_ITERATIONS}} iterations** 🚀

            - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
            - HTTP request          : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms        p(90)=${{ env.HTTP_REQ_DURATION_P_90_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
            - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_TOKENS_AVG }}tk/s p(90)=${{ env.LLAMACPP_PROMPT_TOKENS_P_90_ }}tk/s **total=${{ env.LLAMACPP_PROMPT_TOKENS_TOTAL_COUNTER_RATE }}tk/s**
            - Token generation  (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(90)=${{ env.LLAMACPP_TOKENS_SECOND_P_90_ }}tk/s **total=${{ env.LLAMACPP_COMPLETION_TOKENS_TOTAL_COUNTER_RATE }}tk/s**
            - ${{ env.BENCH_GRAPH_XLABEL }}

            <details>

            <summary>Time series</summary>

            <p align="center">

            <img width="100%" height="100%" src="${{ env.IMAGE_O }}" alt="prompt_tokens_seconds" />

            <details>

            <summary>More</summary>

            ```mermaid
            ${{ env.PROMPT_TOKENS_SECONDS }}
            ```

            </details>

            <img width="100%" height="100%" src="${{ env.IMAGE_1 }}" alt="predicted_tokens_seconds"/>

            <details>
                <summary>More</summary>

            ```mermaid
            ${{ env.PREDICTED_TOKENS_SECONDS }}
            ```

            </details>

            </p>

            <details>

            <summary>Details</summary>

            <p align="center">

            <img width="100%" height="100%" src="${{ env.IMAGE_2 }}" alt="kv_cache_usage_ratio" />

            <details>
                <summary>More</summary>

            ```mermaid
            ${{ env.KV_CACHE_USAGE_RATIO }}
            ```

            </details>

            <img width="100%" height="100%" src="${{ env.IMAGE_3 }}" alt="requests_processing"/>

            <details>
                <summary>More</summary>

            ```mermaid
            ${{ env.REQUESTS_PROCESSING }}
            ```

            </details>

            </p>
            </details>
            </details>
server: continuous performance monitoring and PR comment (#6283) * server: bench: init * server: bench: reduce list of GPU nodes * server: bench: fix graph, fix output artifact * ci: bench: add mermaid in case of image cannot be uploaded * ci: bench: more resilient, more metrics * ci: bench: trigger build * ci: bench: fix duration * ci: bench: fix typo * ci: bench: fix mermaid values, markdown generated * typo on the step name Co-authored-by: Xuan Son Nguyen <thichthat@gmail.com> * ci: bench: trailing spaces * ci: bench: move images in a details section * ci: bench: reduce bullet point size --------- Co-authored-by: Xuan Son Nguyen <thichthat@gmail.com> 2024-03-27 20:26:49 +01:00			`# Benchmark`
			`name: Benchmark`

			`on:`
			`workflow_dispatch:`
			`inputs:`
			`gpu-series:`
			`description: 'Azure GPU series to run with'`
			`required: true`
			`type: choice`
			`options:`
			`- Standard_NC4as_T4_v3`
			`- Standard_NC24ads_A100_v4`
			`- Standard_NC80adis_H100_v5`
			`sha:`
			`description: 'Commit SHA1 to build'`
			`required: false`
			`type: string`
			`duration:`
			`description: 'Duration of the bench'`
			`type: string`
			`default: 10m`

			`push:`
			`branches:`
			`- master`
			`paths: ['.github/workflows/bench.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/bench/**.*']`
			`pull_request:`
			`types: [opened, synchronize, reopened]`
			`paths: ['.github/workflows/bench.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/bench/**.*']`
			`schedule:`
			`- cron: '04 2 * * *'`

			`concurrency:`
			`group: ${{ github.workflow }}-${{ github.ref }}`
			`cancel-in-progress: true`

			`jobs:`
			`bench-server-baseline:`
			`runs-on: Standard_NC4as_T4_v3`
			`env:`
			`RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME Do not find a way to not duplicate it`
			`N_USERS: 8`
			`DURATION: 10m`
			`if: ${{ github.event.inputs.gpu-series == 'Standard_NC4as_T4_v3' \|\| github.event.schedule \|\| github.event.pull_request \|\| github.event.push.ref == 'refs/heads/master' }}`
			`steps:`
			`- name: Clone`
			`id: checkout`
			`uses: actions/checkout@v3`
			`with:`
			`fetch-depth: 0`
			`ref: ${{ github.event.inputs.sha \|\| github.event.pull_request.head.sha \|\| github.sha \|\| github.head_ref \|\| github.ref_name }}`

			`- name: Install python env`
			`id: pipenv`
			`run: \|`
			`cd examples/server/bench`
			`python3 -m venv venv`
			`source venv/bin/activate`
			`pip install -r requirements.txt`

			`- name: Prometheus`
			`id: install_prometheus`
			`run: \|`
			`wget --quiet https://github.com/prometheus/prometheus/releases/download/v2.51.0/prometheus-2.51.0.linux-amd64.tar.gz`
			`tar xzf prometheus*.tar.gz --strip-components=1`
			`./prometheus --config.file=examples/server/bench/prometheus.yml &`
			`while ! nc -z localhost 9090; do`
			`sleep 0.1`
			`done`

			`- name: Install k6`
			`id: k6_installation`
			`run: \|`
			`cd examples/server/bench`
			`wget --quiet https://github.com/grafana/k6/releases/download/v0.49.0/k6-v0.49.0-linux-amd64.tar.gz`
			`tar xzf k6*.tar.gz --strip-components=1`

			`- name: Build`
			`id: cmake_build`
			`run: \|`
			`set -eux`
			`mkdir build`
			`cd build`
			`cmake .. \`
			`-DLLAMA_NATIVE=OFF \`
			`-DLLAMA_BUILD_SERVER=ON \`
			`-DLLAMA_CURL=ON \`
			`-DLLAMA_CUBLAS=ON \`
			`-DCUDAToolkit_ROOT=/usr/local/cuda \`
			`-DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \`
			`-DCMAKE_CUDA_ARCHITECTURES=75 \`
			`-DLLAMA_FATAL_WARNINGS=OFF \`
			`-DLLAMA_ALL_WARNINGS=OFF \`
			`-DCMAKE_BUILD_TYPE=Release;`
			`cmake --build . --config Release -j $(nproc) --target server`

			`- name: Download the dataset`
			`id: download_dataset`
			`run: \|`
			`cd examples/server/bench`
			`wget --quiet https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json`

			`- name: Server bench`
			`id: server_bench`
			`run: \|`
			`set -eux`

			`cd examples/server/bench`
			`source venv/bin/activate`
			`BENCH_K6_BIN_PATH=./k6 python bench.py \`
			`--runner-label ${{ env.RUNNER_LABEL }} \`
			`--name ${{ github.job }} \`
			`--branch ${{ github.head_ref \|\| github.ref_name }} \`
			`--commit ${{ github.event.inputs.sha \|\| github.event.pull_request.head.sha \|\| github.sha }} \`
			`--scenario script.js \`
			`--duration ${{ github.event.inputs.duration \|\| env.DURATION }} \`
			`--hf-repo ggml-org/models \`
			`--hf-file phi-2/ggml-model-q4_0.gguf \`
			`--model-path-prefix /models \`
			`--parallel ${{ env.N_USERS }} \`
			`-ngl 33 \`
			`--batch-size 2048 \`
			`--ubatch-size 256 \`
			`--ctx-size 16384 \`
			`--n-prompts 1000 \`
			`--max-prompt-tokens 1024 \`
			`--max-tokens 2048`

			`cat results.github.env >> $GITHUB_ENV`

			`# Remove dataset as we do not want it in the artefact`
			`rm ShareGPT_V3_unfiltered_cleaned_split.json`

			`- uses: actions/upload-artifact@v4`
			`with:`
			`name: benchmark-results`
			`compression-level: 9`
			`path: \|`
			`examples/server/bench/*.jpg`
			`examples/server/bench/*.json`
			`examples/server/bench/*.log`

			`- name: Commit status`
			`uses: Sibz/github-status-action@v1`
			`with:`
			`authToken: ${{secrets.GITHUB_TOKEN}}`
			`sha: ${{ inputs.sha \|\| github.event.pull_request.head.sha \|\| github.sha }}`
			`context: bench-server-baseline`
			`description: \|`
			`${{ env.BENCH_RESULTS }}`
			`state: 'success'`

			`- name: Upload benchmark images`
			`uses: devicons/public-upload-to-imgur@v2.2.2`
			`continue-on-error: true # Important as it looks unstable: 503`
			`id: imgur_step`
			`with:`
			`client_id: ${{secrets.IMGUR_CLIENT_ID}}`
			`path: \|`
			`examples/server/bench/prompt_tokens_seconds.jpg`
			`examples/server/bench/predicted_tokens_seconds.jpg`
			`examples/server/bench/kv_cache_usage_ratio.jpg`
			`examples/server/bench/requests_processing.jpg`

			`- name: Extract mermaid`
			`id: set_mermaid`
			`run: \|`
			`set -eux`

			`cd examples/server/bench`
			`PROMPT_TOKENS_SECONDS=$(cat prompt_tokens_seconds.mermaid)`
			`echo "PROMPT_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV`
			`echo "$PROMPT_TOKENS_SECONDS" >> $GITHUB_ENV`
			`echo "EOF" >> $GITHUB_ENV`

			`PREDICTED_TOKENS_SECONDS=$(cat predicted_tokens_seconds.mermaid)`
			`echo "PREDICTED_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV`
			`echo "$PREDICTED_TOKENS_SECONDS" >> $GITHUB_ENV`
			`echo "EOF" >> $GITHUB_ENV`

			`KV_CACHE_USAGE_RATIO=$(cat kv_cache_usage_ratio.mermaid)`
			`echo "KV_CACHE_USAGE_RATIO<<EOF" >> $GITHUB_ENV`
			`echo "$KV_CACHE_USAGE_RATIO" >> $GITHUB_ENV`
			`echo "EOF" >> $GITHUB_ENV`

			`REQUESTS_PROCESSING=$(cat requests_processing.mermaid)`
			`echo "REQUESTS_PROCESSING<<EOF" >> $GITHUB_ENV`
			`echo "$REQUESTS_PROCESSING" >> $GITHUB_ENV`
			`echo "EOF" >> $GITHUB_ENV`

			`- name: Extract image url`
			`id: extract_image_url`
			`continue-on-error: true`
			`run: \|`
			`set -eux`

			`echo "IMAGE_O=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[0] }}" >> $GITHUB_ENV`
			`echo "IMAGE_1=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[1] }}" >> $GITHUB_ENV`
			`echo "IMAGE_2=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[2] }}" >> $GITHUB_ENV`
			`echo "IMAGE_3=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[3] }}" >> $GITHUB_ENV`

			`- name: Comment PR`
			`uses: mshick/add-pr-comment@v2`
			`id: comment_pr`
			`if: ${{ github.event.pull_request != '' }}`
			`with:`
			`message-id: bench-${{ github.job }}-${{ env.RUNNER_LABEL }}`
			`message: \|`
			`📈 llama.cpp server for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_: ${{ env.BENCH_ITERATIONS}} iterations 🚀`

			`- Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration \|\| env.DURATION }}`
			`- HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(90)=${{ env.HTTP_REQ_DURATION_P_90_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}`
			`- Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_TOKENS_AVG }}tk/s p(90)=${{ env.LLAMACPP_PROMPT_TOKENS_P_90_ }}tk/s total=${{ env.LLAMACPP_PROMPT_TOKENS_TOTAL_COUNTER_RATE }}tk/s`
			`- Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(90)=${{ env.LLAMACPP_TOKENS_SECOND_P_90_ }}tk/s total=${{ env.LLAMACPP_COMPLETION_TOKENS_TOTAL_COUNTER_RATE }}tk/s`
			`- ${{ env.BENCH_GRAPH_XLABEL }}`

			`<details>`

			`<summary>Time series</summary>`

			`<p align="center">`

			`<img width="100%" height="100%" src="${{ env.IMAGE_O }}" alt="prompt_tokens_seconds" />`

			`<details>`

			`<summary>More</summary>`

			```mermaid
			`${{ env.PROMPT_TOKENS_SECONDS }}`
			```

			`</details>`

			`<img width="100%" height="100%" src="${{ env.IMAGE_1 }}" alt="predicted_tokens_seconds"/>`

			`<details>`
			`<summary>More</summary>`

			```mermaid
			`${{ env.PREDICTED_TOKENS_SECONDS }}`
			```

			`</details>`

			`</p>`

			`<details>`

			`<summary>Details</summary>`

			`<p align="center">`

			`<img width="100%" height="100%" src="${{ env.IMAGE_2 }}" alt="kv_cache_usage_ratio" />`

			`<details>`
			`<summary>More</summary>`

			```mermaid
			`${{ env.KV_CACHE_USAGE_RATIO }}`
			```

			`</details>`

			`<img width="100%" height="100%" src="${{ env.IMAGE_3 }}" alt="requests_processing"/>`

			`<details>`
			`<summary>More</summary>`

			```mermaid
			`${{ env.REQUESTS_PROCESSING }}`
			```

			`</details>`

			`</p>`
			`</details>`
			`</details>`