Unverified commit e7c4f9ee, authored by Ye (Charlotte) Qi and committed by GitHub
Browse files

[CI/Build][Doc] Move existing benchmark scripts in CI/document/example to vllm bench CLI (#21355)


Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
parent 9094d11c
...@@ -227,7 +227,7 @@ run_serving_tests() { ...@@ -227,7 +227,7 @@ run_serving_tests() {
if [[ "$dataset_name" = "sharegpt" ]]; then if [[ "$dataset_name" = "sharegpt" ]]; then
client_command="python3 benchmark_serving.py \ client_command="vllm bench serve \
--backend $backend \ --backend $backend \
--tokenizer /tokenizer_cache \ --tokenizer /tokenizer_cache \
--model $model \ --model $model \
...@@ -248,7 +248,7 @@ run_serving_tests() { ...@@ -248,7 +248,7 @@ run_serving_tests() {
sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len') sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len')
sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len') sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len')
client_command="python3 benchmark_serving.py \ client_command="vllm bench serve \
--backend $backend \ --backend $backend \
--tokenizer /tokenizer_cache \ --tokenizer /tokenizer_cache \
--model $model \ --model $model \
......
...@@ -206,7 +206,7 @@ run_latency_tests() { ...@@ -206,7 +206,7 @@ run_latency_tests() {
fi fi
fi fi
latency_command=" $latency_envs python3 benchmark_latency.py \ latency_command=" $latency_envs vllm bench latency \
--output-json $RESULTS_FOLDER/${test_name}.json \ --output-json $RESULTS_FOLDER/${test_name}.json \
$latency_args" $latency_args"
...@@ -273,7 +273,7 @@ run_throughput_tests() { ...@@ -273,7 +273,7 @@ run_throughput_tests() {
fi fi
fi fi
throughput_command=" $throughput_envs python3 benchmark_throughput.py \ throughput_command=" $throughput_envs vllm bench throughput \
--output-json $RESULTS_FOLDER/${test_name}.json \ --output-json $RESULTS_FOLDER/${test_name}.json \
$throughput_args" $throughput_args"
...@@ -394,7 +394,7 @@ run_serving_tests() { ...@@ -394,7 +394,7 @@ run_serving_tests() {
# pass the tensor parallel size to the client so that it can be displayed # pass the tensor parallel size to the client so that it can be displayed
# on the benchmark dashboard # on the benchmark dashboard
client_command="python3 benchmark_serving.py \ client_command="vllm bench serve \
--save-result \ --save-result \
--result-dir $RESULTS_FOLDER \ --result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \ --result-filename ${new_test_name}.json \
......
...@@ -83,7 +83,7 @@ function cpu_tests() { ...@@ -83,7 +83,7 @@ function cpu_tests() {
set -e set -e
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 & VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1 timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
python3 benchmarks/benchmark_serving.py \ vllm bench serve \
--backend vllm \ --backend vllm \
--dataset-name random \ --dataset-name random \
--model meta-llama/Llama-3.2-3B-Instruct \ --model meta-llama/Llama-3.2-3B-Instruct \
......
...@@ -11,10 +11,10 @@ cd "$(dirname "${BASH_SOURCE[0]}")/../.." ...@@ -11,10 +11,10 @@ cd "$(dirname "${BASH_SOURCE[0]}")/../.."
(which wget && which curl) || (apt-get update && apt-get install -y wget curl) (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
# run python-based benchmarks and upload the result to buildkite # run python-based benchmarks and upload the result to buildkite
python3 benchmarks/benchmark_latency.py --output-json latency_results.json 2>&1 | tee benchmark_latency.txt vllm bench latency --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
bench_latency_exit_code=$? bench_latency_exit_code=$?
python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
bench_throughput_exit_code=$? bench_throughput_exit_code=$?
# run server-based benchmarks and upload the result to buildkite # run server-based benchmarks and upload the result to buildkite
...@@ -24,7 +24,7 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/r ...@@ -24,7 +24,7 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/r
# wait for server to start, timeout after 600 seconds # wait for server to start, timeout after 600 seconds
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1 timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
python3 benchmarks/benchmark_serving.py \ vllm bench serve \
--backend vllm \ --backend vllm \
--dataset-name sharegpt \ --dataset-name sharegpt \
--dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \ --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \
......
...@@ -77,7 +77,7 @@ done ...@@ -77,7 +77,7 @@ done
echo "run benchmark test..." echo "run benchmark test..."
echo "logging to $BM_LOG" echo "logging to $BM_LOG"
echo echo
python benchmarks/benchmark_serving.py \ vllm bench serve \
--backend vllm \ --backend vllm \
--model $MODEL \ --model $MODEL \
--dataset-name sonnet \ --dataset-name sonnet \
......
...@@ -98,7 +98,7 @@ Then run the benchmarking script ...@@ -98,7 +98,7 @@ Then run the benchmarking script
```bash ```bash
# download dataset # download dataset
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json # wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
python3 vllm/benchmarks/benchmark_serving.py \ vllm bench serve \
--backend vllm \ --backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \ --model NousResearch/Hermes-3-Llama-3.1-8B \
--endpoint /v1/completions \ --endpoint /v1/completions \
...@@ -150,7 +150,7 @@ VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct --disable-log-requests ...@@ -150,7 +150,7 @@ VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct --disable-log-requests
```bash ```bash
# run benchmarking script # run benchmarking script
python3 benchmarks/benchmark_serving.py --port 9001 --save-result --save-detailed \ vllm bench serve --port 9001 --save-result --save-detailed \
--backend vllm \ --backend vllm \
--model meta-llama/Llama-3.1-8B-Instruct \ --model meta-llama/Llama-3.1-8B-Instruct \
--endpoint /v1/completions \ --endpoint /v1/completions \
...@@ -174,7 +174,7 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests ...@@ -174,7 +174,7 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
``` ```
```bash ```bash
python3 vllm/benchmarks/benchmark_serving.py \ vllm bench serve \
--backend openai-chat \ --backend openai-chat \
--model Qwen/Qwen2-VL-7B-Instruct \ --model Qwen/Qwen2-VL-7B-Instruct \
--endpoint /v1/chat/completions \ --endpoint /v1/chat/completions \
...@@ -194,7 +194,7 @@ VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \ ...@@ -194,7 +194,7 @@ VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
``` ```
``` bash ``` bash
python3 benchmarks/benchmark_serving.py \ vllm bench serve \
--model meta-llama/Meta-Llama-3-8B-Instruct \ --model meta-llama/Meta-Llama-3-8B-Instruct \
--dataset-name hf \ --dataset-name hf \
--dataset-path likaixin/InstructCoder \ --dataset-path likaixin/InstructCoder \
...@@ -210,7 +210,7 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests ...@@ -210,7 +210,7 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
**`lmms-lab/LLaVA-OneVision-Data`** **`lmms-lab/LLaVA-OneVision-Data`**
```bash ```bash
python3 vllm/benchmarks/benchmark_serving.py \ vllm bench serve \
--backend openai-chat \ --backend openai-chat \
--model Qwen/Qwen2-VL-7B-Instruct \ --model Qwen/Qwen2-VL-7B-Instruct \
--endpoint /v1/chat/completions \ --endpoint /v1/chat/completions \
...@@ -224,7 +224,7 @@ python3 vllm/benchmarks/benchmark_serving.py \ ...@@ -224,7 +224,7 @@ python3 vllm/benchmarks/benchmark_serving.py \
**`Aeala/ShareGPT_Vicuna_unfiltered`** **`Aeala/ShareGPT_Vicuna_unfiltered`**
```bash ```bash
python3 vllm/benchmarks/benchmark_serving.py \ vllm bench serve \
--backend openai-chat \ --backend openai-chat \
--model Qwen/Qwen2-VL-7B-Instruct \ --model Qwen/Qwen2-VL-7B-Instruct \
--endpoint /v1/chat/completions \ --endpoint /v1/chat/completions \
...@@ -237,7 +237,7 @@ python3 vllm/benchmarks/benchmark_serving.py \ ...@@ -237,7 +237,7 @@ python3 vllm/benchmarks/benchmark_serving.py \
**`AI-MO/aimo-validation-aime`** **`AI-MO/aimo-validation-aime`**
``` bash ``` bash
python3 vllm/benchmarks/benchmark_serving.py \ vllm bench serve \
--model Qwen/QwQ-32B \ --model Qwen/QwQ-32B \
--dataset-name hf \ --dataset-name hf \
--dataset-path AI-MO/aimo-validation-aime \ --dataset-path AI-MO/aimo-validation-aime \
...@@ -248,7 +248,7 @@ python3 vllm/benchmarks/benchmark_serving.py \ ...@@ -248,7 +248,7 @@ python3 vllm/benchmarks/benchmark_serving.py \
**`philschmid/mt-bench`** **`philschmid/mt-bench`**
``` bash ``` bash
python3 vllm/benchmarks/benchmark_serving.py \ vllm bench serve \
--model Qwen/QwQ-32B \ --model Qwen/QwQ-32B \
--dataset-name hf \ --dataset-name hf \
--dataset-path philschmid/mt-bench \ --dataset-path philschmid/mt-bench \
...@@ -261,7 +261,7 @@ When using OpenAI-compatible backends such as `vllm`, optional sampling ...@@ -261,7 +261,7 @@ When using OpenAI-compatible backends such as `vllm`, optional sampling
parameters can be specified. Example client command: parameters can be specified. Example client command:
```bash ```bash
python3 vllm/benchmarks/benchmark_serving.py \ vllm bench serve \
--backend vllm \ --backend vllm \
--model NousResearch/Hermes-3-Llama-3.1-8B \ --model NousResearch/Hermes-3-Llama-3.1-8B \
--endpoint /v1/completions \ --endpoint /v1/completions \
...@@ -296,7 +296,7 @@ The following arguments can be used to control the ramp-up: ...@@ -296,7 +296,7 @@ The following arguments can be used to control the ramp-up:
<br/> <br/>
```bash ```bash
python3 vllm/benchmarks/benchmark_throughput.py \ vllm bench throughput \
--model NousResearch/Hermes-3-Llama-3.1-8B \ --model NousResearch/Hermes-3-Llama-3.1-8B \
--dataset-name sonnet \ --dataset-name sonnet \
--dataset-path vllm/benchmarks/sonnet.txt \ --dataset-path vllm/benchmarks/sonnet.txt \
...@@ -314,7 +314,7 @@ Total num output tokens: 1500 ...@@ -314,7 +314,7 @@ Total num output tokens: 1500
**VisionArena Benchmark for Vision Language Models** **VisionArena Benchmark for Vision Language Models**
``` bash ``` bash
python3 vllm/benchmarks/benchmark_throughput.py \ vllm bench throughput \
--model Qwen/Qwen2-VL-7B-Instruct \ --model Qwen/Qwen2-VL-7B-Instruct \
--backend vllm-chat \ --backend vllm-chat \
--dataset-name hf \ --dataset-name hf \
...@@ -336,7 +336,7 @@ Total num output tokens: 1280 ...@@ -336,7 +336,7 @@ Total num output tokens: 1280
``` bash ``` bash
VLLM_WORKER_MULTIPROC_METHOD=spawn \ VLLM_WORKER_MULTIPROC_METHOD=spawn \
VLLM_USE_V1=1 \ VLLM_USE_V1=1 \
python3 vllm/benchmarks/benchmark_throughput.py \ vllm bench throughput \
--dataset-name=hf \ --dataset-name=hf \
--dataset-path=likaixin/InstructCoder \ --dataset-path=likaixin/InstructCoder \
--model=meta-llama/Meta-Llama-3-8B-Instruct \ --model=meta-llama/Meta-Llama-3-8B-Instruct \
...@@ -360,7 +360,7 @@ Total num output tokens: 204800 ...@@ -360,7 +360,7 @@ Total num output tokens: 204800
**`lmms-lab/LLaVA-OneVision-Data`** **`lmms-lab/LLaVA-OneVision-Data`**
```bash ```bash
python3 vllm/benchmarks/benchmark_throughput.py \ vllm bench throughput \
--model Qwen/Qwen2-VL-7B-Instruct \ --model Qwen/Qwen2-VL-7B-Instruct \
--backend vllm-chat \ --backend vllm-chat \
--dataset-name hf \ --dataset-name hf \
...@@ -373,7 +373,7 @@ python3 vllm/benchmarks/benchmark_throughput.py \ ...@@ -373,7 +373,7 @@ python3 vllm/benchmarks/benchmark_throughput.py \
**`Aeala/ShareGPT_Vicuna_unfiltered`** **`Aeala/ShareGPT_Vicuna_unfiltered`**
```bash ```bash
python3 vllm/benchmarks/benchmark_throughput.py \ vllm bench throughput \
--model Qwen/Qwen2-VL-7B-Instruct \ --model Qwen/Qwen2-VL-7B-Instruct \
--backend vllm-chat \ --backend vllm-chat \
--dataset-name hf \ --dataset-name hf \
...@@ -385,7 +385,7 @@ python3 vllm/benchmarks/benchmark_throughput.py \ ...@@ -385,7 +385,7 @@ python3 vllm/benchmarks/benchmark_throughput.py \
**`AI-MO/aimo-validation-aime`** **`AI-MO/aimo-validation-aime`**
```bash ```bash
python3 benchmarks/benchmark_throughput.py \ vllm bench throughput \
--model Qwen/QwQ-32B \ --model Qwen/QwQ-32B \
--backend vllm \ --backend vllm \
--dataset-name hf \ --dataset-name hf \
...@@ -399,7 +399,7 @@ python3 benchmarks/benchmark_throughput.py \ ...@@ -399,7 +399,7 @@ python3 benchmarks/benchmark_throughput.py \
``` bash ``` bash
# download dataset # download dataset
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json # wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
python3 vllm/benchmarks/benchmark_throughput.py \ vllm bench throughput \
--model meta-llama/Llama-2-7b-hf \ --model meta-llama/Llama-2-7b-hf \
--backend vllm \ --backend vllm \
--dataset_path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \ --dataset_path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
......
...@@ -136,7 +136,7 @@ run_benchmark() { ...@@ -136,7 +136,7 @@ run_benchmark() {
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt" bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt"
prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 )) prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 ))
adjusted_input_len=$(( INPUT_LEN - prefix_len )) adjusted_input_len=$(( INPUT_LEN - prefix_len ))
python3 benchmarks/benchmark_serving.py \ vllm bench serve \
--backend vllm \ --backend vllm \
--model $MODEL \ --model $MODEL \
--dataset-name random \ --dataset-name random \
...@@ -169,7 +169,7 @@ adjusted_input_len=$(( INPUT_LEN - prefix_len )) ...@@ -169,7 +169,7 @@ adjusted_input_len=$(( INPUT_LEN - prefix_len ))
curl -X POST http://0.0.0.0:8004/reset_prefix_cache curl -X POST http://0.0.0.0:8004/reset_prefix_cache
sleep 5 sleep 5
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt" bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt"
python3 benchmarks/benchmark_serving.py \ vllm bench serve \
--backend vllm \ --backend vllm \
--model $MODEL \ --model $MODEL \
--dataset-name random \ --dataset-name random \
......
...@@ -11,6 +11,7 @@ from typing import Any, Optional ...@@ -11,6 +11,7 @@ from typing import Any, Optional
import numpy as np import numpy as np
from tqdm import tqdm from tqdm import tqdm
from typing_extensions import deprecated
import vllm.envs as envs import vllm.envs as envs
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
...@@ -34,6 +35,10 @@ def save_to_pytorch_benchmark_format( ...@@ -34,6 +35,10 @@ def save_to_pytorch_benchmark_format(
write_to_json(pt_file, pt_records) write_to_json(pt_file, pt_records)
@deprecated(
"benchmark_latency.py is deprecated and will be removed in a "
"future version. Please use 'vllm bench latency' instead.",
)
def main(args: argparse.Namespace): def main(args: argparse.Namespace):
print(args) print(args)
......
...@@ -38,6 +38,7 @@ from typing import Any, Literal, Optional ...@@ -38,6 +38,7 @@ from typing import Any, Literal, Optional
import numpy as np import numpy as np
from tqdm.asyncio import tqdm from tqdm.asyncio import tqdm
from transformers import PreTrainedTokenizerBase from transformers import PreTrainedTokenizerBase
from typing_extensions import deprecated
from backend_request_func import ( from backend_request_func import (
ASYNC_REQUEST_FUNCS, ASYNC_REQUEST_FUNCS,
...@@ -593,6 +594,10 @@ def save_to_pytorch_benchmark_format( ...@@ -593,6 +594,10 @@ def save_to_pytorch_benchmark_format(
write_to_json(pt_file, pt_records) write_to_json(pt_file, pt_records)
@deprecated(
"benchmark_serving.py is deprecated and will be removed in a future "
"version. Please use 'vllm bench serve' instead.",
)
def main(args: argparse.Namespace): def main(args: argparse.Namespace):
print(args) print(args)
random.seed(args.seed) random.seed(args.seed)
......
...@@ -15,6 +15,7 @@ import torch ...@@ -15,6 +15,7 @@ import torch
import uvloop import uvloop
from tqdm import tqdm from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase
from typing_extensions import deprecated
from benchmark_dataset import ( from benchmark_dataset import (
AIMODataset, AIMODataset,
...@@ -382,6 +383,10 @@ def get_requests(args, tokenizer): ...@@ -382,6 +383,10 @@ def get_requests(args, tokenizer):
return dataset_cls(**common_kwargs).sample(**sample_kwargs) return dataset_cls(**common_kwargs).sample(**sample_kwargs)
@deprecated(
"benchmark_throughput.py is deprecated and will be removed in a "
"future version. Please use 'vllm bench throughput' instead.",
)
def main(args: argparse.Namespace): def main(args: argparse.Namespace):
if args.seed is None: if args.seed is None:
args.seed = 0 args.seed = 0
......
...@@ -38,7 +38,7 @@ VLLM_TORCH_PROFILER_DIR=./vllm_profile \ ...@@ -38,7 +38,7 @@ VLLM_TORCH_PROFILER_DIR=./vllm_profile \
benchmark_serving.py: benchmark_serving.py:
```bash ```bash
python benchmarks/benchmark_serving.py \ vllm bench serve \
--backend vllm \ --backend vllm \
--model meta-llama/Meta-Llama-3-70B \ --model meta-llama/Meta-Llama-3-70B \
--dataset-name sharegpt \ --dataset-name sharegpt \
...@@ -75,7 +75,7 @@ The following is an example using the `benchmarks/benchmark_latency.py` script: ...@@ -75,7 +75,7 @@ The following is an example using the `benchmarks/benchmark_latency.py` script:
nsys profile -o report.nsys-rep \ nsys profile -o report.nsys-rep \
--trace-fork-before-exec=true \ --trace-fork-before-exec=true \
--cuda-graph-trace=node \ --cuda-graph-trace=node \
python benchmarks/benchmark_latency.py \ vllm bench latency \
--model meta-llama/Llama-3.1-8B-Instruct \ --model meta-llama/Llama-3.1-8B-Instruct \
--num-iters-warmup 5 \ --num-iters-warmup 5 \
--num-iters 1 \ --num-iters 1 \
...@@ -98,7 +98,7 @@ nsys profile -o report.nsys-rep \ ...@@ -98,7 +98,7 @@ nsys profile -o report.nsys-rep \
vllm serve meta-llama/Llama-3.1-8B-Instruct vllm serve meta-llama/Llama-3.1-8B-Instruct
# client # client
python benchmarks/benchmark_serving.py \ vllm bench serve \
--backend vllm \ --backend vllm \
--model meta-llama/Llama-3.1-8B-Instruct \ --model meta-llama/Llama-3.1-8B-Instruct \
--num-prompts 1 \ --num-prompts 1 \
......
...@@ -291,7 +291,7 @@ curl -X POST -s http://10.0.1.1:10001/v1/completions \ ...@@ -291,7 +291,7 @@ curl -X POST -s http://10.0.1.1:10001/v1/completions \
??? console "Command" ??? console "Command"
```shell ```shell
python3 benchmark_serving.py \ vllm bench serve \
--backend vllm \ --backend vllm \
--model base_model \ --model base_model \
--tokenizer meta-llama/Llama-3.1-8B-Instruct \ --tokenizer meta-llama/Llama-3.1-8B-Instruct \
......
...@@ -233,7 +233,7 @@ main() { ...@@ -233,7 +233,7 @@ main() {
# Run Benchmark # Run Benchmark
# ============================================================================= # =============================================================================
cd ../../../benchmarks/ cd ../../../benchmarks/
python3 benchmark_serving.py --port 10001 --seed $(date +%s) \ vllm bench serve --port 10001 --seed $(date +%s) \
--model $MODEL \ --model $MODEL \
--dataset-name random --random-input-len 7500 --random-output-len 200 \ --dataset-name random --random-input-len 7500 --random-output-len 200 \
--num-prompts 200 --burstiness 100 --request-rate 2 | tee benchmark.log --num-prompts 200 --burstiness 100 --request-rate 2 | tee benchmark.log
......
...@@ -122,7 +122,7 @@ main() { ...@@ -122,7 +122,7 @@ main() {
# begin benchmark # begin benchmark
cd ../../../../benchmarks/ cd ../../../../benchmarks/
python3 benchmark_serving.py --port 9000 --seed $(date +%s) \ vllm bench serve --port 9000 --seed $(date +%s) \
--model meta-llama/Llama-3.1-8B-Instruct \ --model meta-llama/Llama-3.1-8B-Instruct \
--dataset-name random --random-input-len 7500 --random-output-len 200 \ --dataset-name random --random-input-len 7500 --random-output-len 200 \
--num-prompts 200 --burstiness 100 --request-rate 3.6 | tee benchmark.log --num-prompts 200 --burstiness 100 --request-rate 3.6 | tee benchmark.log
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment