feat: Add automatic nightly benchmarks (#2591)

* feat: Add automatic nightly benchmarks
* fix: Update runners group
* fix: add created_at field to results
* fix: Add variable results file location

feat: Add automatic nightly benchmarks (#2591)
* feat: Add automatic nightly benchmarks
* fix: Update runners group
* fix: add created_at field to results
* fix: Add variable results file location
d5bc6a20 · Hugo Larcher · GitHub · d012f229 · d5bc6a20 · d012f229
Unverified Commit d5bc6a20 authored Nov 21, 2024 by Hugo Larcher Committed by GitHub Nov 21, 2024
8 changed files
--- a/.github/workflows/load_test.yaml
+++ b/.github/workflows/load_test.yaml
@@ -3,12 +3,17 @@ name: Nightly load test
 on:
  schedule:
    - cron: '0 0 * * 1-5'
+  workflow_call:
+  workflow_dispatch:
  pull_request:
    paths:
      - ".github/workflows/load_test.yaml"
-    branches:
-      - 'main'
+env:
+  AWS_DEFAULT_REGION: us-east-1
+  AWS_ACCESS_KEY_ID: ${{ secrets.S3_AWS_ACCESS_KEY_ID }}
+  AWS_SECRET_ACCESS_KEY: ${{ secrets.S3_AWS_SECRET_ACCESS_KEY }}
 jobs:
  load-tests:
@@ -16,28 +21,30 @@ jobs:
      group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }}
      cancel-in-progress: true
    runs-on:
-      group: aws-g5-12xlarge
+      group: aws-g6-12xl-plus-priv-cache
    env:
      DOCKER_VOLUME: /cache
    steps:
      - name: Checkout repository
        uses: actions/checkout@v3
-      - name: Install k6
+      - name: Install Python 3.11
-        run: |
+        uses: actions/setup-python@v2
-          curl https://github.com/grafana/k6/releases/download/v0.44.0/k6-v0.44.0-linux-amd64.tar.gz -L | tar xvz --strip-components 1
+        with:
+          python-version: 3.11
-      - name: Start starcoder
-        run: |
-          docker run --name tgi-starcoder --rm --gpus all -p 3000:80 -v /mnt/cache:/data -e HF_TOKEN=${{ secrets.HF_TOKEN }} --pull always -d ghcr.io/huggingface/text-generation-inference:latest --model-id bigcode/starcoder --num-shard 2 --max-batch-total-tokens 32768
-          sleep 10
-          wget --timeout 10 --retry-on-http-error --waitretry=1 --tries=240 http://localhost:3000/health
-      - name: Run k6
+      - name: Install poetry
        run: |
-          ./k6 run load_tests/starcoder_load.js
+          curl -sSL https://install.python-poetry.org | python3 -
+          export PATH="$HOME/.local/bin:$PATH"
+          poetry --version
-      - name: Stop starcoder
+      - name: Run bench test
-        if: ${{ always() }}
        run: |
-          docker stop tgi-starcoder || true
+          export PATH="$HOME/.local/bin:$PATH"
+          cd load_tests
+          poetry install
+          poetry run python benchmarks.py --sha ${{ github.sha }} --results-file "s3://text-generation-inference-ci/benchmarks/ci/${{ github.sha }}.parquet"
+        shell: bash
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN_BENCHMARK }}
--- a/load_tests/Makefile
+++ b/load_tests/Makefile
-ShareGPT_V3_unfiltered_cleaned_split.json:
-	wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-prepare_share: ShareGPT_V3_unfiltered_cleaned_split.json
-	python filter.py
-prepare_orca:
-	python orca.py
--- a/load_tests/benchmarks.py
+++ b/load_tests/benchmarks.py
+import argparse
+import datetime
+import json
+import os
+import traceback
+from typing import Dict, Tuple, List
+import GPUtil
+import docker
+from docker.models.containers import Container
+from loguru import logger
+import pandas as pd
+class InferenceEngineRunner:
+    """Base interface for objects that start and stop an inference engine.
+
+    Subclasses are expected to override both ``run`` and ``stop``; the
+    implementations here only raise ``NotImplementedError``.
+    """
+    def __init__(self, model: str):
+        # Model identifier that subclasses pass on to the engine they launch.
+        self.model = model
+    def run(self, parameters: list[tuple], gpus: int = 0):
+        """Start the engine with the given ``(name, value)`` parameters.
+
+        Raises:
+            NotImplementedError: always; subclasses must override this method.
+        """
+        # The exception has to be raised, not merely instantiated, otherwise a
+        # subclass that forgets to override silently does nothing.
+        raise NotImplementedError("This method should be implemented by the subclass")
+    def stop(self):
+        """Stop the engine.
+
+        Raises:
+            NotImplementedError: always; subclasses must override this method.
+        """
+        raise NotImplementedError("This method should be implemented by the subclass")
+class TGIDockerRunner(InferenceEngineRunner):
+    """Inference engine runner that launches TGI from a Docker image."""
+    def __init__(self,
+                 model: str,
+                 image: str = "ghcr.io/huggingface/text-generation-inference:latest",
+                 volumes=None):
+        super().__init__(model)
+        self.container = None
+        self.image = image
+        # Sequence of (host_path, container_path) pairs; empty when omitted.
+        self.volumes = [] if volumes is None else volumes
+    def run(self, parameters: list[tuple], gpus: int = 0):
+        """Launch the container and keep a handle to it in ``self.container``.
+
+        Each entry of ``parameters`` is rendered as ``--<entry[0]> <entry[1]>``
+        after the fixed ``--model-id``/``--port`` arguments. ``gpus`` is
+        forwarded to ``run_docker``.
+        """
+        extra_flags = "".join(f" --{flag[0]} {str(flag[1])}" for flag in parameters)
+        launch_args = f"--model-id {self.model} --port 8080" + extra_flags
+        logger.info(f"Running TGI with parameters: {launch_args}")
+        # Every configured volume is bind-mounted read-write.
+        mounts = {vol[0]: {"bind": vol[1], "mode": "rw"} for vol in self.volumes}
+        # "Connected" / "ERROR" are the success / error log sentinels.
+        self.container = run_docker(
+            self.image,
+            launch_args,
+            "Connected",
+            "ERROR",
+            volumes=mounts,
+            gpus=gpus,
+            ports={"8080/tcp": 8080},
+        )
+    def stop(self):
+        """Stop the container if one has been started; otherwise do nothing."""
+        if not self.container:
+            return
+        self.container.stop()
+class BenchmarkRunner:
+    """Runs the text-generation-inference-benchmark tool from a Docker image."""
+    def __init__(self,
+                 image: str = "ghcr.io/huggingface/text-generation-inference-benchmark:latest",
+                 volumes: List[Tuple[str, str]] = None):
+        self.container = None
+        self.image = image
+        # (host_path, container_path) pairs; empty when omitted.
+        self.volumes = [] if volumes is None else volumes
+    def run(self, parameters: list[tuple], network_mode):
+        """Run the benchmark container and keep a handle in ``self.container``.
+
+        An entry whose second element is ``None`` becomes a bare ``--<name>``
+        switch; any other entry becomes ``--<name> <value>``. ``network_mode``
+        is forwarded to ``run_docker`` unchanged.
+        """
+        pieces = ["text-generation-inference-benchmark"]
+        for option in parameters:
+            if option[1] is None:
+                pieces.append(f"--{option[0]}")
+            else:
+                pieces.append(f"--{option[0]} {str(option[1])}")
+        command = " ".join(pieces)
+        logger.info(f"Running text-generation-inference-benchmarks with parameters: {command}")
+        # Every configured volume is bind-mounted read-write.
+        mounts = {vol[0]: {"bind": vol[1], "mode": "rw"} for vol in self.volumes}
+        rust_env = {
+            "RUST_LOG": "text_generation_inference_benchmark=info",
+            "RUST_BACKTRACE": "full",
+        }
+        # "Benchmark finished" / "Fatal:" are the success / error log sentinels.
+        self.container = run_docker(
+            self.image,
+            command,
+            "Benchmark finished",
+            "Fatal:",
+            volumes=mounts,
+            extra_env=rust_env,
+            network_mode=network_mode,
+        )
+    def stop(self):
+        """Stop the container if one has been started; otherwise do nothing."""
+        if not self.container:
+            return
+        self.container.stop()
+def run_docker(image: str, args: str, success_sentinel: str,
+               error_sentinel: str, ports: Dict[str, int] = None, volumes=None, network_mode: str = "bridge",
+               gpus: int = 0, extra_env: Dict[str, str] = None) -> Container:
+    """Start a detached container and follow its logs until a sentinel shows up.
+
+    Args:
+        image: Docker image to run.
+        args: Command-line string passed to the container.
+        success_sentinel: Substring of a log line that ends the wait successfully.
+        error_sentinel: Substring of a log line that marks a failure.
+        ports: Port mapping passed to Docker; defaults to no published ports.
+        volumes: Volume mapping passed to Docker; defaults to no volumes.
+        network_mode: Docker network mode for the container.
+        gpus: Number of GPUs to expose; ``0`` requests no GPU devices.
+        extra_env: Environment variables added on top of ``HF_TOKEN``.
+
+    Returns:
+        The container handle. Every log line read is echoed to stdout.
+
+    Raises:
+        Exception: if a log line contains ``error_sentinel``; the container is
+            stopped before raising.
+    """
+    if ports is None:
+        ports = {}
+    if volumes is None:
+        volumes = {}
+    if extra_env is None:
+        extra_env = {}
+    client = docker.from_env(timeout=300)
+    # Expose the first `gpus` devices, addressed by index, out of the GPUs
+    # that GPUtil reports on this host.
+    devices = [f"{i}" for i in range(get_num_gpus())][:gpus]
+    # HF_TOKEN is forwarded from the caller's environment (None when unset).
+    environment = {"HF_TOKEN": os.environ.get("HF_TOKEN")}
+    environment.update(extra_env)
+    container = client.containers.run(image, args,
+                                      detach=True,
+                                      device_requests=[
+                                          docker.types.DeviceRequest(device_ids=devices,
+                                                                     capabilities=[['gpu']])
+                                      ] if gpus > 0 else None,
+                                      volumes=volumes,
+                                      shm_size="1g",
+                                      ports=ports,
+                                      network_mode=network_mode,
+                                      environment=environment, )
+    # Encode the sentinels once; log lines arrive as bytes.
+    success_bytes = success_sentinel.encode("utf-8")
+    error_bytes = error_sentinel.encode("utf-8")
+    for line in container.logs(stream=True):
+        # errors="replace" keeps a stray non-UTF-8 byte in the container output
+        # from aborting the whole run with UnicodeDecodeError.
+        print(line.decode("utf-8", errors="replace"), end="")
+        if success_bytes in line:
+            break
+        if error_bytes in line:
+            container.stop()
+            raise Exception(f"Error starting container: {line}")
+    else:
+        # The log stream ended without either sentinel. The container is still
+        # returned, as before, but the situation is no longer silent.
+        logger.warning(f"Container log stream ended without '{success_sentinel}' or '{error_sentinel}'")
+    return container
+def get_gpu_names() -> str:
+    """Summarise the GPUs reported by GPUtil as ``'<count>x<name of first GPU>'``.
+
+    Returns an empty string when GPUtil reports no GPU.
+    """
+    gpus = GPUtil.getGPUs()
+    if not gpus:
+        return ''
+    # The list is non-empty here, so no fallback label is needed.
+    return f'{len(gpus)}x{gpus[0].name}'
+def get_gpu_name() -> str:
+    """Return the name of the first GPU reported by GPUtil, or '' if there is none."""
+    detected = GPUtil.getGPUs()
+    return detected[0].name if len(detected) != 0 else ''
+def get_num_gpus() -> int:
+    """Return how many GPUs GPUtil reports."""
+    detected = GPUtil.getGPUs()
+    return len(detected)
+def build_df(model: str, data_files: dict[str, str]) -> pd.DataFrame:
+    """Load benchmark result JSON files into a single DataFrame.
+
+    Each file must contain a ``results`` list and a ``config.meta`` mapping
+    with ``engine``, ``tp`` and ``version``. Every result becomes one row: its
+    nested ``config`` is flattened into columns via ``pd.json_normalize`` and
+    the file-level metadata, ``model`` and a shared UTC ``created_at``
+    timestamp are attached.
+
+    Args:
+        model: Value stored in the ``model`` column of every row.
+        data_files: Mapping whose values are paths of the JSON files to load
+            (the keys are not used).
+
+    Returns:
+        One row per result, or an empty DataFrame when there are no results.
+    """
+    now = datetime.datetime.now(datetime.timezone.utc)
+    created_at = now.isoformat()  # '2024-10-02T11:53:17.026215+00:00'
+    frames = []
+    for filename in data_files.values():
+        with open(filename, 'r', encoding='utf-8') as f:
+            data = json.load(f)
+        for result in data['results']:
+            meta = data['config']['meta']
+            [config] = pd.json_normalize(result['config']).to_dict(orient='records')
+            # Build a new dict so the loaded JSON structure is left untouched.
+            entry = {**result, **config}
+            entry['engine'] = meta['engine']
+            entry['tp'] = meta['tp']
+            entry['version'] = meta['version']
+            entry['model'] = model
+            entry['created_at'] = created_at
+            del entry['config']
+            frames.append(pd.DataFrame(entry, index=[0]))
+    if not frames:
+        return pd.DataFrame()
+    # Concatenate once instead of growing the frame row by row.
+    return pd.concat(frames)
+def main(sha, results_file):
+    """Benchmark each configured model against TGI and write the aggregated results.
+
+    For every model a TGI container is started, the benchmark container is run
+    inside the TGI container's network namespace, and both are stopped again.
+    The JSON files found under ``results/<model dir>`` next to this script are
+    then merged into one DataFrame and written to ``results_file`` as Parquet.
+
+    Args:
+        sha: Commit SHA recorded as the ``version`` in the benchmark metadata.
+        results_file: Destination passed to ``DataFrame.to_parquet``.
+
+    Exits the process with status 1 if any model's benchmark raised.
+    """
+    # Local import keeps this function self-contained; sys.exit is used
+    # because the bare exit() helper is only injected by the site module.
+    import sys
+    # Results live in a `results` directory next to this script.
+    results_dir = os.path.join(os.path.dirname(__file__), 'results')
+    logger.info('Starting benchmark')
+    # (model id, number of GPUs / tensor-parallel degree)
+    models = [
+        ('meta-llama/Llama-3.1-8B-Instruct', 1),
+        # ('meta-llama/Llama-3.1-70B-Instruct', 4),
+        # ('mistralai/Mixtral-8x7B-Instruct-v0.1', 2),
+    ]
+    success = True
+    for model_id, num_gpus in models:
+        tgi_runner = TGIDockerRunner(model_id)
+        # One results directory per model, with '/' and '.' made path-safe.
+        model_dir = os.path.join(results_dir, model_id.replace("/", "_").replace(".", "_"))
+        os.makedirs(model_dir, exist_ok=True)
+        runner = BenchmarkRunner(
+            volumes=[(model_dir, '/opt/text-generation-inference-benchmark/results')]
+        )
+        try:
+            tgi_runner.run([('max-concurrent-requests', 512)], gpus=num_gpus)
+            logger.info(f'TGI started for model {model_id}')
+            parameters = [
+                ('tokenizer-name', model_id),
+                ('max-vus', 800),
+                ('url', 'http://localhost:8080'),
+                ('duration', '120s'),
+                ('warmup', '30s'),
+                ('benchmark-kind', 'rate'),
+                ('prompt-options', 'num_tokens=200,max_tokens=220,min_tokens=180,variance=10'),
+                ('decode-options', 'num_tokens=200,max_tokens=220,min_tokens=180,variance=10'),
+                ('extra-meta', f'"engine=TGI,tp={num_gpus},version={sha},gpu={get_gpu_name()}"'),
+                ('no-console', None)
+            ]
+            # Request rates 0.8, 1.6, ..., 24.0, one `--rates` flag each.
+            rates = [('rates', f'{r / 10.}') for r in range(8, 248, 8)]
+            parameters.extend(rates)
+            # Sharing the TGI container's network makes localhost:8080 reach TGI.
+            runner.run(parameters, f'container:{tgi_runner.container.id}')
+        except Exception as e:
+            logger.error(f'Error running benchmark for model {model_id}: {e}')
+            # print the stack trace
+            print(traceback.format_exc())
+            success = False
+        finally:
+            # Stop the benchmark container even if stopping TGI raises.
+            try:
+                tgi_runner.stop()
+            finally:
+                runner.stop()
+    if not success:
+        logger.error('Some benchmarks failed')
+        sys.exit(1)
+    df = pd.DataFrame()
+    # Every sub-directory of the results directory is treated as one model's results.
+    directories = [f'{results_dir}/{d}' for d in os.listdir(results_dir) if os.path.isdir(f'{results_dir}/{d}')]
+    logger.info(f'Found result directories: {directories}')
+    for directory in directories:
+        data_files = {}
+        for filename in os.listdir(directory):
+            if filename.endswith('.json'):
+                data_files[filename.split('.')[-2]] = f'{directory}/{filename}'
+        logger.info(f'Processing directory {directory}')
+        # The directory name (the path-safe model id) is stored as the model.
+        df = pd.concat([df, build_df(directory.split('/')[-1], data_files)])
+    df['device'] = get_gpu_name()
+    # Percentage of failed requests among all requests.
+    df['error_rate'] = df['failed_requests'] / (df['failed_requests'] + df['successful_requests']) * 100.0
+    # NOTE(review): for an s3:// destination pandas needs an fsspec S3 backend
+    # (s3fs) to be installed - confirm it is available in this environment.
+    df.to_parquet(results_file)
+if __name__ == "__main__":
+    # Command-line entry point: --sha is mandatory, --results-file falls back
+    # to '<sha>.parquet' in the current directory when it is not given.
+    cli = argparse.ArgumentParser()
+    cli.add_argument("--sha", help="SHA of the commit to add to the results", required=True)
+    cli.add_argument("--results-file",
+                     help="The file where to store the results, can be a local file or a s3 path")
+    options = cli.parse_args()
+    destination = options.results_file if options.results_file is not None else f'{options.sha}.parquet'
+    main(options.sha, destination)
--- a/load_tests/common.js
+++ b/load_tests/common.js
-import { check } from 'k6';
-import { scenario } from 'k6/execution';
-import http from 'k6/http';
-import { Trend, Counter } from 'k6/metrics';
-const host = __ENV.HOST;
-const model_id = __ENV.MODEL_ID;
-const timePerToken = new Trend('time_per_token', true);
-const tokens = new Counter('tokens');
-const new_tokens = new Counter('new_tokens');
-const input_tokens = new Counter('input_tokens');
-const max_new_tokens = 50;
-// const shareGPT = JSON.parse(open("ShareGPT_V3_unfiltered_cleaned_split.json"))
-const shareGPT = JSON.parse(open("small.json"))
-export function get_options() {
-    return {
-        thresholds: {
-            http_req_failed: ['rate==0'],
-            // time_per_token: [{
-            //     threshold: `p(50)<${5 * reference_latency_ms}`,
-            //     abortOnFail: true,
-            //     delayAbortEval: '10s'
-            // }],
-        },
-        scenarios: {
-            // single_user: {
-            //     executor: 'constant-arrival-rate',
-            //     duration: '60s',
-            //     preAllocatedVUs: 1,
-            //     rate: 20,
-            //     timeUnit: '1s',
-            // },
-            // load_test: {
-            //     executor: 'constant-arrival-rate',
-            //     duration: '60s',
-            //     preAllocatedVUs: 100,
-            //     rate: 1,
-            //     timeUnit: '1s',
-            // },
-            // breakpoint: {
-            //     executor: 'ramping-arrival-rate', //Assure load increase if the system slows
-            //     preAllocatedVUs: 300,
-            //     stages: [
-            //         { duration: '60s', target: 100 }, // just slowly ramp-up to a HUGE load
-            //     ],
-            // },
-            throughput: {
-                executor: 'shared-iterations',
-                vus: 100,
-                iterations: 200,
-                maxDuration: '40s',
-            },
-        },
-    };
-}
-function generate_payload(gpt, max_new_tokens) {
-    const input = gpt["conversations"][0]["value"];
-    return { "messages": [{ "role": "user", "content": input }], "temperature": 0, "model": `${model_id}`, "max_tokens": max_new_tokens }
-}
-export const options = get_options();
-export default function run() {
-    const headers = { 'Content-Type': 'application/json' };
-    const query = shareGPT[scenario.iterationInTest % shareGPT.length];
-    const payload = JSON.stringify(generate_payload(query, max_new_tokens));
-    const res = http.post(`http://${host}/v1/chat/completions`, payload, {
-        headers,
-    });
-    if (res.status >= 400 && res.status < 500) {
-        return;
-    }
-    check(res, {
-        'Post status is 200': (res) => res.status === 200,
-    });
-    const duration = res.timings.duration;
-    if (res.status === 200) {
-        const body = res.json();
-        const completion_tokens = body.usage.completion_tokens;
-        const latency_ms_per_token = duration / completion_tokens;
-        timePerToken.add(latency_ms_per_token);
-        const prompt_tokens = body.usage.prompt_tokens;
-        input_tokens.add(prompt_tokens);
-        new_tokens.add(completion_tokens);
-        tokens.add(completion_tokens + prompt_tokens);
-    }
-}
--- a/load_tests/filter.py
+++ b/load_tests/filter.py
-import json
-def main():
-    with open("./ShareGPT_V3_unfiltered_cleaned_split.json", "r") as f:
-        data = json.load(f)
-    # Select only the first 2k conversations that start with a human.
-    max = 2000
-    conversations = []
-    for conversation in data:
-        conv = conversation.get("conversations")
-        if conv and conv[0]["from"] == "human":
-            # Trim the rest of the output
-            conversation["conversations"] = conversation["conversations"][:1]
-            conversations.append(conversation)
-            if len(conversation) >= max:
-                break
-    with open("./small.json", "w") as f:
-        data = json.dump(conversations, f, indent=4)
-if __name__ == "__main__":
-    main()
--- a/load_tests/orca.py
+++ b/load_tests/orca.py
-import json
-import datasets
-import tqdm
-def main():
-    dataset = datasets.load_dataset("Open-Orca/OpenOrca", split="train")
-    # Select only the first 2k conversations that start with a human.
-    max = min(2000, len(dataset))
-    conversations = []
-    for item in tqdm.tqdm(dataset, total=max):
-        conversation = {
-            "conversations": [
-                {"from": "human", "value": item["question"]},
-            ],
-            "id": item["id"],
-        }
-        conversations.append(conversation)
-        if len(conversations) >= max:
-            break
-    with open("./small.json", "w") as f:
-        json.dump(conversations, f, indent=4)
-if __name__ == "__main__":
-    main()
--- a/load_tests/poetry.lock
+++ b/load_tests/poetry.lock
--- a/load_tests/pyproject.toml
+++ b/load_tests/pyproject.toml
+# Poetry project definition for the load_tests benchmark scripts.
+[tool.poetry]
+name = "text-generation-inference-benchmarks"
+version = "0.1.0"
+description = ""
+authors = ["Hugo Larcher <hugo.larcher@huggingface.co>"]
+readme = "README.md"
+# NOTE(review): benchmarks.py in this change does not import psutil - confirm
+# it is still required.
+# NOTE(review): the workflow passes an s3:// results path to
+# DataFrame.to_parquet; pandas needs an fsspec S3 backend (s3fs) for that and
+# none is declared here - confirm it is provided some other way.
+[tool.poetry.dependencies]
+python = "^3.11"
+docker = "^7.1.0"
+loguru = "^0.7.2"
+psutil = "^6.0.0"
+gputil = "^1.4.0"
+pandas = "^2.2.3"
+pyarrow = "^17.0.0"
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
\ No newline at end of file