Merge tag 'v0.9.0' into v0.9.0-ori

7a985548 · zhuwenwen · 45d3785c · dc1440cf · 7a985548 · 7a985548
Commit 7a985548 authored May 22, 2025 by zhuwenwen
20 changed files
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -32,6 +32,7 @@ steps:
 ##### fast check tests  #####
 - label: Documentation Build # 2min
+  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/test_docs/docs"
  fast_check: true
  no_gpu: True
@@ -39,9 +40,10 @@ steps:
  - pip install -r ../../requirements/docs.txt
  - SPHINXOPTS=\"-W\" make html
  # Check API reference (if it fails, you may have missing mock imports)
-  - grep \"sig sig-object py\" build/html/api/inference_params.html
+  - grep \"sig sig-object py\" build/html/api/vllm/vllm.sampling_params.html
 - label: Async Engine, Inputs, Utils, Worker Test # 24min
+  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/
  - tests/mq_llm_engine
@@ -62,6 +64,7 @@ steps:
  - pytest -v -s worker # Worker
 - label: Python-only Installation Test
+  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - tests/standalone_tests/python_only_compile.sh
  - setup.py
@@ -69,7 +72,7 @@ steps:
  - bash standalone_tests/python_only_compile.sh
 - label: Basic Correctness Test # 30min
-  #mirror_hardwares: [amd]
+  mirror_hardwares: [amdexperimental, amdproduction]
  fast_check: true
  torch_nightly: true
  source_file_dependencies:
@@ -86,6 +89,7 @@ steps:
  - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
 - label: Chunked Prefill Test
+  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/
  - tests/basic_correctness/test_chunked_prefill
@@ -94,7 +98,7 @@ steps:
  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
 - label: Core Test # 10min
-  mirror_hardwares: [amd]
+  mirror_hardwares: [amdexperimental, amdproduction]
  fast_check: true
  source_file_dependencies:
  - vllm/core
@@ -104,10 +108,10 @@ steps:
  - pytest -v -s core
 - label: Entrypoints Test # 40min
+  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  fast_check: true
  torch_nightly: true
-  #mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/
  - tests/entrypoints/llm
@@ -126,6 +130,7 @@ steps:
  - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
 - label: Distributed Tests (4 GPUs) # 10min
+  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  source_file_dependencies:
@@ -143,6 +148,8 @@ steps:
  # test with tp=2 and external_dp=2
  - VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
  - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
+  # test with tp=2 and pp=2
+  - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
  # test with internal dp
  - python3 ../examples/offline_inference/data_parallel.py
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
@@ -153,12 +160,12 @@ steps:
  # TODO: create a dedicated test section for multi-GPU example tests
  # when we have multiple distributed example tests
  - pushd ../examples/offline_inference
-  - python3 rlhf.py
+  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
-  - RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
+  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
  - popd
 - label: Metrics, Tracing Test # 10min
-  mirror_hardwares: [amd]
+  mirror_hardwares: [amdexperimental, amdproduction]
  num_gpus: 2
  source_file_dependencies:
  - vllm/
@@ -172,7 +179,7 @@ steps:
 #####  1 GPU test  #####
 - label: Regression Test # 5min
-  #mirror_hardwares: [amd]
+  mirror_hardwares: [amdexperimental, amdproduction]
  source_file_dependencies:
  - vllm/
  - tests/test_regression
@@ -182,7 +189,7 @@ steps:
  working_dir: "/vllm-workspace/tests" # optional
 - label: Engine Test # 10min
-  mirror_hardwares: [amd]
+  mirror_hardwares: [amdexperimental, amdproduction]
  source_file_dependencies:
  - vllm/
  - tests/engine
@@ -196,7 +203,7 @@ steps:
  - pytest -v -s tokenization
 - label: V1 Test
-  #mirror_hardwares: [amd]
+  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
    - vllm/
    - tests/v1
@@ -209,8 +216,8 @@ steps:
    - pytest -v -s v1/worker
    - pytest -v -s v1/structured_output
    - pytest -v -s v1/spec_decode
+    - pytest -v -s v1/kv_connector/unit
    - pytest -v -s v1/test_serial_utils.py
-    - pytest -v -s v1/test_stats.py
    - pytest -v -s v1/test_utils.py
    - pytest -v -s v1/test_oracle.py
    # TODO: accuracy does not match, whether setting
@@ -221,8 +228,8 @@ steps:
    - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
 - label: Examples Test # 25min
+  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/examples"
-  #mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/entrypoints
  - examples/
@@ -246,7 +253,7 @@ steps:
    - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
 - label: Prefix Caching Test # 9min
-  mirror_hardwares: [amd]
+  mirror_hardwares: [amdexperimental, amdproduction]
  source_file_dependencies:
  - vllm/
  - tests/prefix_caching
@@ -254,6 +261,7 @@ steps:
    - pytest -v -s prefix_caching
 - label: Samplers Test # 36min
+  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/model_executor/layers
  - vllm/sampling_metadata.py
@@ -264,7 +272,7 @@ steps:
    - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
 - label: LogitsProcessor Test # 5min
-  mirror_hardwares: [amd]
+  mirror_hardwares: [amdexperimental, amdproduction]
  source_file_dependencies:
  - vllm/model_executor/layers
  - vllm/model_executor/guided_decoding
@@ -275,6 +283,7 @@ steps:
    - pytest -v -s model_executor/test_guided_processors.py
 - label: Speculative decoding tests # 40min
+  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/spec_decode
  - tests/spec_decode
@@ -285,7 +294,7 @@ steps:
    - pytest -v -s spec_decode/e2e/test_eagle_correctness.py
 - label: LoRA Test %N # 15min each
-  #mirror_hardwares: [amd]
+  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/lora
  - tests/lora
@@ -293,15 +302,20 @@ steps:
  parallelism: 4
 - label: PyTorch Compilation Unit Tests
+  mirror_hardwares: [amdexperimental, amdproduction]
+  torch_nightly: true
  source_file_dependencies:
    - vllm/
    - tests/compile
  commands:
    - pytest -v -s compile/test_pass_manager.py
    - pytest -v -s compile/test_fusion.py
+    - pytest -v -s compile/test_silu_mul_quant_fusion.py
    - pytest -v -s compile/test_sequence_parallelism.py
 - label: PyTorch Fullgraph Smoke Test # 9min
+  mirror_hardwares: [amdexperimental, amdproduction]
+  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/compile
@@ -312,6 +326,8 @@ steps:
  - pytest -v -s compile/piecewise/test_toy_llama.py
 - label: PyTorch Fullgraph Test # 18min
+  mirror_hardwares: [amdexperimental, amdproduction]
+  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/compile
@@ -319,6 +335,7 @@ steps:
  - pytest -v -s compile/test_full_graph.py
 - label: Kernels Core Operation Test
+  mirror_hardwares: [amdexperimental, amdproduction]
  source_file_dependencies:
  - csrc/
  - tests/kernels/core
@@ -326,6 +343,7 @@ steps:
    - pytest -v -s kernels/core
 - label: Kernels Attention Test %N
+  mirror_hardwares: [amdexperimental, amdproduction]
  source_file_dependencies:
  - csrc/attention/
  - vllm/attention
@@ -336,6 +354,7 @@ steps:
  parallelism: 2
 - label: Kernels Quantization Test %N
+  mirror_hardwares: [amdexperimental, amdproduction]
  source_file_dependencies:
  - csrc/quantization/
  - vllm/model_executor/layers/quantization
@@ -345,6 +364,7 @@ steps:
  parallelism: 2
 - label: Kernels MoE Test
+  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - csrc/moe/
  - tests/kernels/moe
@@ -353,6 +373,7 @@ steps:
    - pytest -v -s kernels/moe
 - label: Kernels Mamba Test
+  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - csrc/mamba/
  - tests/kernels/mamba
@@ -360,7 +381,7 @@ steps:
    - pytest -v -s kernels/mamba
 - label: Tensorizer Test # 11min
-  # mirror_hardwares: [amd]
+  mirror_hardwares: [amdexperimental, amdproduction]
  soft_fail: true
  source_file_dependencies:
  - vllm/model_executor/model_loader
@@ -371,37 +392,42 @@ steps:
    - pytest -v -s tensorizer_loader
 - label: Benchmarks # 9min
+  mirror_hardwares: [amdexperimental, amdproduction]
  working_dir: "/vllm-workspace/.buildkite"
-  mirror_hardwares: [amd]
  source_file_dependencies:
  - benchmarks/
  commands:
  - bash scripts/run-benchmarks.sh
 - label: Benchmarks CLI Test # 10min
+  mirror_hardwares: [amdexperimental, amdproduction]
  source_file_dependencies:
  - vllm/
  - tests/benchmarks/
  commands:
  - pytest -v -s benchmarks/
- label: Quantization Test # 33min
+- label: Quantization Test
+  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  - tests/quantization
-  command: VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
+  commands:
+  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
 - label: LM Eval Small Models # 53min
+  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - bash ./run-tests.sh -c configs/models-small.txt -t 1
+  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
 - label: OpenAI API correctness
+  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - csrc/
  - vllm/entrypoints/openai/
@@ -410,6 +436,7 @@ steps:
  - pytest -s entrypoints/openai/correctness/
 - label: Encoder Decoder tests # 5min
+  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/
  - tests/encoder_decoder
@@ -417,8 +444,8 @@ steps:
    - pytest -v -s encoder_decoder
 - label: OpenAI-Compatible Tool Use # 20 min
+  mirror_hardwares: [amdexperimental]
  fast_check: false
-  #mirror_hardwares: [ amd ]
  source_file_dependencies:
    - vllm/
    - tests/tool_use
@@ -430,92 +457,98 @@ steps:
 #####  models test  #####
 - label: Basic Models Test # 24min
+  mirror_hardwares: [amdexperimental, amdproduction]
+  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/models
  commands:
    - pytest -v -s models/test_transformers.py
    - pytest -v -s models/test_registry.py
+    - pytest -v -s models/test_utils.py
+    - pytest -v -s models/test_vision.py
    # V1 Test: https://github.com/vllm-project/vllm/issues/14531
    - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'
    - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'llama4'
    - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'plamo2'
- label: Language Models Test (Standard) # 32min
+- label: Language Models Test (Standard)
-  #mirror_hardwares: [amd]
+  mirror_hardwares: [amdexperimental]
+  torch_nightly: true
  source_file_dependencies:
  - vllm/
-  - tests/models/decoder_only/language
+  - tests/models/language
-  - tests/models/embedding/language
-  - tests/models/encoder_decoder/language
  commands:
    # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
-    - pip install causal-conv1d
+    - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
-    - pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
+    - pip freeze | grep -E 'torch'
-    - pytest -v -s models/embedding/language -m core_model
+    - pytest -v -s models/language -m core_model
- label: Language Models Test (Extended) # 1h10min
+- label: Language Models Test (Extended)
+  mirror_hardwares: [amdexperimental]
  optional: true
  source_file_dependencies:
  - vllm/
-  - tests/models/decoder_only/language
+  - tests/models/language
-  - tests/models/embedding/language
-  - tests/models/encoder_decoder/language
  commands:
    # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
-    - pip install causal-conv1d
+    - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
-    - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
+    - pytest -v -s models/language -m 'not core_model'
-    - pytest -v -s models/embedding/language -m 'not core_model'
- label: Multi-Modal Models Test (Standard) # 40min
+- label: Multi-Modal Models Test (Standard)
-  #mirror_hardwares: [amd]
+  mirror_hardwares: [amdexperimental]
+  torch_nightly: true
  source_file_dependencies:
  - vllm/
-  - tests/models/decoder_only/audio_language
+  - tests/models/multimodal
-  - tests/models/decoder_only/vision_language
-  - tests/models/embedding/vision_language
-  - tests/models/encoder_decoder/audio_language
-  - tests/models/encoder_decoder/vision_language
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-    - pytest -v -s models/multimodal
+    - pip freeze | grep -E 'torch'
-    - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
+    - pytest -v -s models/multimodal/processing
-    - pytest -v -s models/decoder_only/vision_language -m 'core_model or quant_model'
+    - pytest -v -s --ignore models/multimodal/generation/test_whisper.py models/multimodal -m core_model
-    - pytest -v -s models/embedding/vision_language -m core_model
+    - cd .. && pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model  # Otherwise, mp_method="spawn" doesn't work
-    - pytest -v -s models/encoder_decoder/audio_language -m core_model
-    - pytest -v -s models/encoder_decoder/language -m core_model
+- label: Multi-Modal Models Test (Extended) 1
-    - pytest -v -s models/encoder_decoder/vision_language -m core_model
+  mirror_hardwares: [amdexperimental]
-    - pytest -v -s models/decoder_only/vision_language/test_interleaved.py
- label: Multi-Modal Models Test (Extended) 1 # 48m
  optional: true
  source_file_dependencies:
  - vllm/
-  - tests/models/decoder_only/audio_language
+  - tests/models/multimodal
-  - tests/models/decoder_only/vision_language
-  - tests/models/embedding/vision_language
-  - tests/models/encoder_decoder/vision_language
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-    - pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
+    - pytest -v -s --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing models/multimodal -m 'not core_model'
-    - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=0) and not core_model and not quant_model'
-    - pytest -v -s --ignore models/decoder_only/vision_language/test_models.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
+- label: Multi-Modal Models Test (Extended) 2
-    - pytest -v -s models/embedding/vision_language -m 'not core_model'
+  mirror_hardwares: [amdexperimental]
-    - pytest -v -s models/encoder_decoder/language -m 'not core_model'
-    - pytest -v -s models/encoder_decoder/vision_language -m 'not core_model'
- label: Multi-Modal Models Test (Extended) 2 # 38m
  optional: true
  source_file_dependencies:
  - vllm/
-  - tests/models/decoder_only/vision_language
+  - tests/models/multimodal
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-    - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=1) and not core_model and not quant_model'
+    - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
+- label: Multi-Modal Models Test (Extended) 3
+  mirror_hardwares: [amdexperimental, amdproduction]
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/multimodal
+  commands:
+    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+    - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
+- label: Quantized Models Test
+  mirror_hardwares: [amdexperimental, amdproduction]
+  source_file_dependencies:
+  - vllm/model_executor/layers/quantization
+  - tests/models/quantization
+  commands:
+    - pytest -v -s models/quantization
 # This test is used only in PR development phase to test individual models and should never run on main
 - label: Custom Models Test
-  mirror_hardwares: [amd]
+  mirror_hardwares: [amdexperimental, amdproduction]
  optional: true
  commands:
    - echo 'Testing custom models...'
@@ -527,7 +560,7 @@ steps:
 #####  multi gpus test  #####
 - label: Distributed Comm Ops Test # 7min
-  mirror_hardwares: [amd]
+  mirror_hardwares: [amdexperimental, amdproduction]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
@@ -538,6 +571,7 @@ steps:
  - pytest -v -s distributed/test_shm_broadcast.py
 - label: 2 Node Tests (4 GPUs in total) # 16min
+  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  num_nodes: 2
@@ -556,7 +590,7 @@ steps:
    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
 - label: Distributed Tests (2 GPUs) # 40min
-  #mirror_hardwares: [amd]
+  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
@@ -581,9 +615,8 @@ steps:
  - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
  # Avoid importing model tests that cause CUDA reinitialization error
  - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
-  - pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)'
+  - pytest models/language -v -s -m 'distributed(num_gpus=2)'
-  - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
+  - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)'
-  - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
  # test sequence parallel
  - pytest -v -s distributed/test_sequence_parallel.py
  # this test fails consistently.
@@ -594,13 +627,14 @@ steps:
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
 - label: Plugin Tests (2 GPUs) # 40min
+  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
  - vllm/plugins/
  - tests/plugins/
  commands:
-  # begin platform plugin tests, all the code in-between runs on dummy platform
+  # begin platform plugin and general plugin tests, all the code in-between runs on dummy platform
  - pip install -e ./plugins/vllm_add_dummy_platform
  - pytest -v -s plugins_tests/test_platform_plugins.py
  - pip uninstall vllm_add_dummy_platform -y
@@ -611,8 +645,10 @@ steps:
  - pytest -v -s distributed/test_distributed_oot.py
  - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
  - pytest -v -s models/test_oot_registration.py # it needs a clean process
+  - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
 - label: Multi-step Tests (4 GPUs) # 36min
+  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  source_file_dependencies:
@@ -633,6 +669,7 @@ steps:
  - pytest -v -s multi_step/test_correctness_llm.py
 - label: Pipeline Parallelism Test # 45min
+  mirror_hardwares: [amdexperimental, amdproduction]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  source_file_dependencies:
@@ -646,6 +683,7 @@ steps:
  - pytest -v -s distributed/test_pipeline_parallel.py
 - label: LoRA TP Test (Distributed)
+  mirror_hardwares: [amdexperimental, amdproduction]
  num_gpus: 4
  source_file_dependencies:
  - vllm/lora
@@ -661,6 +699,7 @@ steps:
 - label: Weight Loading Multiple GPU Test  # 33min
+  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
@@ -670,6 +709,7 @@ steps:
    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
 - label: Weight Loading Multiple GPU Test - Large Models # optional
+  mirror_hardwares: [amdexperimental] 
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  gpu: a100
@@ -708,4 +748,4 @@ steps:
  - vllm/model_executor/layers/quantization
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - bash ./run-tests.sh -c configs/models-large.txt -t 4
+  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
--- a/.github/ISSUE_TEMPLATE/400-bug-report.yml
+++ b/.github/ISSUE_TEMPLATE/400-bug-report.yml
@@ -21,12 +21,12 @@ body:
      It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
    value: |
      <details>
-      <summary>The output of `python collect_env.py`</summary>
+      <summary>The output of <code>python collect_env.py</code></summary>
      ```text
      Your output of `python collect_env.py` here
      ```
      </details>
  validations:
    required: true
@@ -75,7 +75,7 @@ body:
      ```
      ```
-      The error message you got, with the full traceback.
+      The error message you got, with the full traceback and the error logs with [dump_input.py:##] if present.
      ```
  validations:
    required: true

--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@@ -163,6 +163,17 @@ pull_request_rules:
       https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork
+- name: assign reviewer for tensorizer changes
+  conditions:
+      # Mergify requires every entry under `conditions` to hold, so the path
+      # patterns are grouped under `or:` to match a PR touching any one of them.
+      - or:
+        - files~=^vllm/model_executor/model_loader/tensorizer.py
+        - files~=^vllm/model_executor/model_loader/tensorizer_loader.py
+        - files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
+        - files~=^tests/tensorizer_loader/
+  actions:
+    assign:
+      users:
+        - "sangstar"
 - name: remove 'needs-rebase' label when conflict is resolved
  conditions:
      - -conflict

--- a/.github/workflows/add_label_automerge.yml
+++ b/.github/workflows/add_label_automerge.yml
 name: Add label on auto-merge enabled
+permissions:
+    pull-requests: write
 on:
    pull_request_target:
        types:

--- a/.github/workflows/lint-and-deploy.yaml
+++ b/.github/workflows/lint-and-deploy.yaml
@@ -2,6 +2,9 @@ name: Lint and Deploy Charts
 on: pull_request
+permissions:
+  contents: read
 jobs:
  lint-and-deploy:
    runs-on: ubuntu-latest
@@ -66,7 +69,7 @@ jobs:
          export AWS_SECRET_ACCESS_KEY=minioadmin
          sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" &
          helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
      - name: curl test
        run: |
          kubectl -n ns-vllm port-forward service/test-vllm-service 8001:80 &
@@ -79,4 +82,4 @@ jobs:
                          "max_tokens": 7,
                          "temperature": 0
                  }'):$CODE"
          echo "$CODE"
\ No newline at end of file
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@@ -5,6 +5,9 @@ on:
  push:
    branches: [main]
+permissions:
+  contents: read
 jobs:
  pre-commit:
    runs-on: ubuntu-latest

--- a/.github/workflows/reminder_comment.yml
+++ b/.github/workflows/reminder_comment.yml
 name: PR Reminder Comment Bot
+permissions:
+  pull-requests: write
 on:
  pull_request_target:
    types: [opened]

--- a/.gitignore
+++ b/.gitignore
@@ -80,6 +80,7 @@ instance/
 # Sphinx documentation
 docs/_build/
 docs/source/getting_started/examples/
+docs/source/api/vllm
 # PyBuilder
 .pybuilder/

--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -12,29 +12,31 @@ repos:
  - id: yapf
    args: [--in-place, --verbose]
 - repo: https://github.com/astral-sh/ruff-pre-commit
-  rev: v0.9.3
+  rev: v0.11.7
  hooks:
  - id: ruff
    args: [--output-format, github, --fix]
+  - id: ruff-format
+    files: ^(.buildkite|benchmarks)/.*
 - repo: https://github.com/codespell-project/codespell
-  rev: v2.4.0
+  rev: v2.4.1
  hooks:
  - id: codespell
    additional_dependencies: ['tomli']
    args: ['--toml', 'pyproject.toml']
 - repo: https://github.com/PyCQA/isort
-  rev: 0a0b7a830386ba6a31c2ec8316849ae4d1b8240d # 6.0.0
+  rev: 6.0.1
  hooks:
  - id: isort
 - repo: https://github.com/pre-commit/mirrors-clang-format
-  rev: v19.1.7
+  rev: v20.1.3
  hooks:
  - id: clang-format
    exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
    types_or: [c++, cuda]
    args: [--style=file, --verbose]
 - repo: https://github.com/jackdewinter/pymarkdown
-  rev: v0.9.27
+  rev: v0.9.29
  hooks:
  - id: pymarkdown
    args: [fix]
@@ -43,10 +45,10 @@ repos:
  hooks:
  - id: actionlint
 - repo: https://github.com/astral-sh/uv-pre-commit
-  rev: 0.6.2
+  rev: 0.6.17
  hooks:
    - id: pip-compile
-      args: [requirements/test.in, -o, requirements/test.txt]
+      args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128]
      files: ^requirements/test\.(in|txt)$
 - repo: local
  hooks:
@@ -101,8 +103,8 @@ repos:
    args:
      - -c
      - |
-        if ! grep -q "^Signed-off-by: $(git config user.name) <$(git config user.email)>" .git/COMMIT_EDITMSG; then
+        # Resolve COMMIT_EDITMSG through git so the hook also works when .git is not
+        # a plain directory (e.g. linked worktrees).
+        if ! grep -q "^Signed-off-by: $(git config user.name) <$(git config user.email)>" "$(git rev-parse --git-path COMMIT_EDITMSG)"; then
-          printf "\nSigned-off-by: $(git config user.name) <$(git config user.email)>\n" >> .git/COMMIT_EDITMSG
+          # Pass the identity as printf arguments, not inside the format string, so
+          # a '%' or backslash in the name/email is written literally.
+          printf '\nSigned-off-by: %s <%s>\n' "$(git config user.name)" "$(git config user.email)" >> "$(git rev-parse --git-path COMMIT_EDITMSG)"
         fi
    language: system
    verbose: true
@@ -125,8 +127,6 @@ repos:
    name: Update Dockerfile dependency graph
    entry: tools/update-dockerfile-graph.sh
    language: script
-    files: ^docker/Dockerfile$
-    pass_filenames: false
  # Keep `suggestion` last
  - id: suggestion
    name: Suggestion

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -15,7 +15,6 @@ project(vllm_extensions LANGUAGES CXX)
 # CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
 set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM")
 message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
 message(STATUS "Target device: ${VLLM_TARGET_DEVICE}")
@@ -46,8 +45,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
 # requirements.txt files and should be kept consistent.  The ROCm torch
 # versions are derived from docker/Dockerfile.rocm
 #
-set(TORCH_SUPPORTED_VERSION_CUDA "2.6.0")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.7.0")
-set(TORCH_SUPPORTED_VERSION_ROCM "2.6.0")
+set(TORCH_SUPPORTED_VERSION_ROCM "2.7.0")
 #
 # Try to find python package with an executable that exactly matches
@@ -231,6 +230,7 @@ set(VLLM_EXT_SRC
  "csrc/attention/paged_attention_v1.cu"
  "csrc/attention/paged_attention_v2.cu"
  "csrc/attention/merge_attn_states.cu"
+  "csrc/attention/vertical_slash_index.cu"
  "csrc/pos_encoding_kernels.cu"
  "csrc/activation_kernels.cu"
  "csrc/layernorm_kernels.cu"
@@ -242,6 +242,7 @@ set(VLLM_EXT_SRC
  # "csrc/quantization/fp8/common.cu"
  # "csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
  "csrc/quantization/gguf/gguf_kernel.cu"
+  "csrc/quantization/activation_kernels.cu"
  "csrc/cuda_utils_kernels.cu"
  "csrc/prepare_inputs/advance_step.cu"
  "csrc/custom_all_reduce.cu"
@@ -250,9 +251,8 @@ set(VLLM_EXT_SRC
 if(VLLM_GPU_LANG STREQUAL "CUDA")
  SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
-  # Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
+  # Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
-  # Please keep this in sync with FetchContent_Declare line below.
+  set(CUTLASS_REVISION "v3.9.2" CACHE STRING "CUTLASS revision to use")
-  set(CUTLASS_REVISION "v3.9.0" CACHE STRING "CUTLASS revision to use")
  # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
  if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
@@ -270,7 +270,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
        cutlass
        GIT_REPOSITORY https://github.com/nvidia/cutlass.git
        # Please keep this in sync with CUTLASS_REVISION line above.
-        GIT_TAG v3.9.0
+        GIT_TAG ${CUTLASS_REVISION}
        GIT_PROGRESS TRUE
        # Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
@@ -290,6 +290,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
    "csrc/quantization/fp4/nvfp4_quant_entry.cu"
    "csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
+    "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu"
    "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
    "csrc/cutlass_extensions/common.cpp"
    "csrc/attention/mla/cutlass_mla_entry.cu")
@@ -301,10 +302,55 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  # Only build Marlin kernels if we are building for at least some compatible archs.
  # Keep building Marlin for 9.0 as there are some group sizes and shapes that
  # are not supported by Machete yet.
-  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
+  # 9.0 for latest bf16 atomicAdd PTX
+  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;9.0+PTX" "${CUDA_ARCHS}")
  if (MARLIN_ARCHS)
+    #
+    # For the Marlin kernels we automatically generate sources for various
+    # preselected input type pairs and schedules.
+    # Generate sources:
+    set(MARLIN_GEN_SCRIPT
+      ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/gptq_marlin/generate_kernels.py)
+    file(MD5 ${MARLIN_GEN_SCRIPT} MARLIN_GEN_SCRIPT_HASH)
+    message(STATUS "Marlin generation script hash: ${MARLIN_GEN_SCRIPT_HASH}")
+    message(STATUS "Last run Marlin generate script hash: $CACHE{MARLIN_GEN_SCRIPT_HASH}")
+    if (NOT DEFINED CACHE{MARLIN_GEN_SCRIPT_HASH}
+        OR NOT $CACHE{MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MARLIN_GEN_SCRIPT_HASH})
+      # Run the Marlin kernel generator. stdout/stderr go only to the log file so
+      # that marlin_generation_result holds nothing but the process exit status
+      # (combining OUTPUT_VARIABLE with OUTPUT_FILE has unspecified precedence).
+      # CMake does not expand a bare $PYTHONPATH and no shell is involved, so the
+      # caller's value is forwarded explicitly via $ENV{...}.
+      execute_process(
+        COMMAND ${CMAKE_COMMAND} -E env
+        "PYTHONPATH=$ENV{PYTHONPATH}"
+          ${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT}
+        RESULT_VARIABLE marlin_generation_result
+        OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
+        ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
+      )
+      if (NOT marlin_generation_result EQUAL 0)
+        message(FATAL_ERROR "Marlin generation failed."
+                            " Result: \"${marlin_generation_result}\""
+                            "\nCheck the log for details: "
+                            "${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log")
+      else()
+        set(MARLIN_GEN_SCRIPT_HASH ${MARLIN_GEN_SCRIPT_HASH}
+            CACHE STRING "Last run Marlin generate script hash" FORCE)
+        message(STATUS "Marlin generation completed successfully.")
+      endif()
+    else()
+      message(STATUS "Marlin generation script has not changed, skipping generation.")
+    endif()
+    file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/kernel_*.cu")
+    set_gencode_flags_for_srcs(
+      SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
+      CUDA_ARCHS "${MARLIN_ARCHS}")
+    list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})
    set(MARLIN_SRCS
-       "csrc/quantization/fp8/fp8_marlin.cu"
       "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
       "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
       "csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
@@ -376,6 +422,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    set(SRCS
      "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
+      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu"
    )
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
@@ -400,8 +447,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  #
  # For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
  # kernels for the remaining archs that are not already built for 3x.
+  # (Build 8.9 for FP8)
  cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
-    "7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
+    "7.5;8.0;8.9+PTX" "${CUDA_ARCHS}")
  # subtract out the archs that are already built for 3x
  list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
  if (SCALED_MM_2X_ARCHS)
@@ -452,7 +500,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND FP4_ARCHS)
    set(SRCS
      "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
-      "csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu")
+      "csrc/quantization/fp4/nvfp4_experts_quant.cu"
+      "csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu"
+      "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${FP4_ARCHS}")
@@ -490,7 +540,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  # The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and only works
  # on Hopper). get_cutlass_moe_mm_data should only be compiled if it's possible
  # to compile MoE kernels that use its output.
-  cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
+  cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
    set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu"
             "csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
@@ -628,7 +678,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    CUDA_ARCHS "${CUDA_ARCHS}")
  list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
-  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
+  # 9.0 for latest bf16 atomicAdd PTX
+  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;9.0+PTX" "${CUDA_ARCHS}")
  if (MARLIN_MOE_ARCHS)
    #
@@ -646,7 +697,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
        OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH})
      execute_process(
        COMMAND ${CMAKE_COMMAND} -E env
-        PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
+        PYTHONPATH=$PYTHONPATH
          ${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT}
        RESULT_VARIABLE moe_marlin_generation_result
        OUTPUT_VARIABLE moe_marlin_generation_output
@@ -682,6 +733,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  endif()
 endif()
+if(VLLM_GPU_LANG STREQUAL "CUDA")
+  # MoE permute/unpermute kernel sources, appended to VLLM_MOE_EXT_SRC below.
+  set(MOE_PERMUTE_SRC
+      "csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu"
+      "csrc/moe/moe_permute_unpermute_op.cu")
+  # Apply gencode flags to the sources declared above.
+  # NOTE(review): MOE_PERMUTE_ARCHS is not set in the visible part of this
+  # file - confirm it is defined elsewhere, otherwise use CUDA_ARCHS.
+  set_gencode_flags_for_srcs(
+    SRCS "${MOE_PERMUTE_SRC}"
+    CUDA_ARCHS "${MOE_PERMUTE_ARCHS}")
+  list(APPEND VLLM_MOE_EXT_SRC "${MOE_PERMUTE_SRC}")
+endif()
 message(STATUS "Enabling moe extension.")
 define_gpu_extension_target(
  _moe_C
@@ -690,6 +752,8 @@ define_gpu_extension_target(
  SOURCES ${VLLM_MOE_EXT_SRC}
  COMPILE_FLAGS ${VLLM_GPU_FLAGS}
  ARCHITECTURES ${VLLM_GPU_ARCHES}
+  INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR}
+  INCLUDE_DIRECTORIES ${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
  USE_SABI 3
  WITH_SOABI)

--- a/README.md
+++ b/README.md
@@ -16,18 +16,20 @@ Easy, fast, and cheap LLM serving for everyone
 ---
 *Latest News* 🔥
+- [2025/05] We hosted [NYC vLLM Meetup](https://lu.ma/c1rqyf1f)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing).
+- [2025/05] vLLM is now a hosted project under PyTorch Foundation! Please find the announcement [here](https://pytorch.org/blog/pytorch-foundation-welcomes-vllm/).
 - [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing).
+- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
+<details>
+<summary>Previous News</summary>
 - [2025/03] We hosted [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing).
 - [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
 - [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0).
 - [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted.
- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
 - [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing), and Google Cloud team [here](https://drive.google.com/file/d/1h24pHewANyRL11xy5dXUbvRC9F9Kkjix/view?usp=sharing).
 - [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!
-<details>
-<summary>Previous News</summary>
 - [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing).
 - [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there!
 - [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://www.youtube.com/playlist?list=PLzTswPQNepXl6AQwifuwUImLPFRVpksjR) from other vLLM contributors and users!
@@ -72,7 +74,7 @@ vLLM is flexible and easy to use with:
 - OpenAI-compatible API server
 - Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron.
 - Prefix caching support
- Multi-lora support
+- Multi-LoRA support
 vLLM seamlessly supports most popular open-source models on HuggingFace, including:
 - Transformer-like LLMs (e.g., Llama)

--- a/benchmarks/auto_tune.sh
+++ b/benchmarks/auto_tune.sh
+#!/bin/bash
+# This script aims to tune the best server parameter combinations to maximize throughput for given requirement. 
+# The current server parameter combination is  max_num_seqs and max_num_batched_tokens
+# It also supports additional requirement: e2e latency and prefix cache. 
+# Pre-requisite:
+# 1. Check out your branch and install/update the correct running environment. For TPU, activate the conda env and install the corresponding torch and xla versions.
+# 2. If the model is customized, replace the MODEL's config with the customized config.
+# 3. Set variables (ALL REQUIRED)
+#   BASE: your directory for vllm repo
+#   MODEL: the model served by vllm
+#   DOWNLOAD_DIR: directory to download and load model weights.
+#   INPUT_LEN: request input len
+#   OUTPUT_LEN: request output len
+#   MIN_CACHE_HIT_PCT: prefix cache rate
+#   MAX_LATENCY_ALLOWED_MS: (e2e) latency requirement. If there's no latency requirement, set it to a large number like 1000000000
+# 4. Run the script. It might take a long time, so consider running it inside tmux so that it keeps going if your connection drops.
+# 5. The final result will be saved in RESULT file. 
+# Example use cases 
+# 1. Given input_len=1800, output_len=20, what's the best max_num_seqs and max_num_batched_tokens to get highest throughput?
+# Use INPUT_LEN=1800,  OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=0, MAX_LATENCY_ALLOWED_MS=100000000000
+# 2. If we have latency requirement to be lower than 500ms, what's the best server parameter?
+# Use INPUT_LEN=1800,  OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=0, MAX_LATENCY_ALLOWED_MS=500
+# 3. If we want to reach 60% prefix cache, what's the best server parameter? 
+# Use INPUT_LEN=1800,  OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=60, MAX_LATENCY_ALLOWED_MS=500
+TAG=$(date +"%Y_%m_%d_%H_%M")
+BASE=""
+MODEL="meta-llama/Llama-3.1-8B-Instruct"
+DOWNLOAD_DIR=""
+INPUT_LEN=4000
+OUTPUT_LEN=16
+MIN_CACHE_HIT_PCT_PCT=0
+MAX_LATENCY_ALLOWED_MS=100000000000
+LOG_FOLDER="$BASE/auto-benchmark/$TAG"
+RESULT="$LOG_FOLDER/result.txt"
+echo "result file$ $RESULT"
+echo "model: $MODEL"
+echo
+rm -rf $LOG_FOLDER
+mkdir -p $LOG_FOLDER
+cd "$BASE/vllm"
+# create sonnet-4x.txt so that we can sample 2048 tokens for input
+echo "" > benchmarks/sonnet_4x.txt
+for _ in {1..4}
+do
+cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt
+done
+pip install datasets
+current_hash=$(git rev-parse HEAD)
+echo "hash:$current_hash" >> "$RESULT"
+echo "current_hash: $current_hash"
+best_throughput=0
+best_max_num_seqs=0
+best_num_batched_tokens=0
+best_goodput=0
+# Start a vLLM server with the given scheduler limits, benchmark it, and update
+# the best_* globals when this configuration meets the latency requirement.
+# Arguments:
+#   $1 - value passed to --max-num-seqs
+#   $2 - value passed to --max-num-batched-tokens
+# Returns:
+#   0 once the benchmark has run, 1 if the server never reported startup.
+run_benchmark() {
+    local max_num_seqs=$1
+    local max_num_batched_tokens=$2
+    echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
+    local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
+    echo "vllm_log: $vllm_log"
+    echo
+    rm -f "$vllm_log"
+    # start the server
+    VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 vllm serve "$MODEL" \
+        --disable-log-requests \
+        --port 8004 \
+        --gpu-memory-utilization 0.98 \
+        --max-num-seqs "$max_num_seqs" \
+        --max-num-batched-tokens "$max_num_batched_tokens" \
+        --tensor-parallel-size 1 \
+        --enable-prefix-caching \
+        --load-format dummy \
+        --download-dir "$DOWNLOAD_DIR" \
+        --max-model-len $(( INPUT_LEN+OUTPUT_LEN )) > "$vllm_log" 2>&1 &
+    echo "wait for 10 minutes.."
+    echo
+    # Poll the server log every 10 seconds, 60 times (10 minutes in total).
+    local server_started=0
+    for _ in {1..60}; do
+        if grep -Fq "Application startup complete" "$vllm_log"; then
+            echo "Application started"
+            server_started=1
+            break
+        else
+            # echo "wait for 10 seconds..."
+            sleep 10
+        fi
+    done
+    if (( ! server_started )); then
+        echo "server did not start within 10 minutes, terminate the benchmarking. Please check server log at $vllm_log"
+        echo "pkill vllm"
+        echo
+        pkill vllm
+        sleep 10
+        return 1
+    fi
+    echo "run benchmark test..."
+    echo
+    local meet_latency_requirement=0
+    # The first run uses an unbounded request rate; keep it in request_rate so
+    # the result line never reports an empty or stale value.
+    local request_rate=inf
+    local through_put e2el goodput
+    # get a basic qps by using request-rate inf
+    local bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt"
+    local prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 ))
+    python benchmarks/benchmark_serving.py \
+        --backend vllm \
+        --model "$MODEL"  \
+        --dataset-name sonnet \
+        --dataset-path benchmarks/sonnet_4x.txt \
+        --sonnet-input-len "$INPUT_LEN" \
+        --sonnet-output-len "$OUTPUT_LEN" \
+        --ignore-eos \
+        --disable-tqdm \
+        --request-rate "$request_rate" \
+        --percentile-metrics ttft,tpot,itl,e2el \
+        --goodput e2el:"$MAX_LATENCY_ALLOWED_MS" \
+        --num-prompts 100 \
+        --sonnet-prefix-len "$prefix_len" \
+        --port 8004 > "$bm_log"
+    # Extract the metrics from the benchmark log.
+    through_put=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
+    e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
+    goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
+    if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
+        meet_latency_requirement=1
+    fi
+    if (( ! meet_latency_requirement )); then
+        # start from request-rate as int(through_put) + 1 and step down by one
+        # until the P99 latency fits or the rate reaches zero
+        request_rate=$((${through_put%.*} + 1))
+        while ((request_rate > 0)); do
+            # clear prefix cache
+            curl -X POST http://0.0.0.0:8004/reset_prefix_cache
+            sleep 5
+            bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt"
+            python benchmarks/benchmark_serving.py \
+                --backend vllm \
+                --model "$MODEL"  \
+                --dataset-name sonnet \
+                --dataset-path benchmarks/sonnet_4x.txt \
+                --sonnet-input-len "$INPUT_LEN" \
+                --sonnet-output-len "$OUTPUT_LEN" \
+                --ignore-eos \
+                --disable-tqdm \
+                --request-rate "$request_rate" \
+                --percentile-metrics ttft,tpot,itl,e2el \
+                --goodput e2el:"$MAX_LATENCY_ALLOWED_MS" \
+                --num-prompts 100 \
+                --sonnet-prefix-len "$prefix_len" \
+                --port 8004 > "$bm_log"
+            through_put=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
+            e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
+            goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
+            if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
+                meet_latency_requirement=1
+                break
+            fi
+            request_rate=$((request_rate-1))
+        done
+    fi
+    # write the results and update the best result.
+    if ((meet_latency_requirement)); then
+        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, through put: $through_put, goodput: $goodput"
+        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, through put: $through_put, goodput: $goodput" >> "$RESULT"
+        if (( $(echo "$through_put > $best_throughput" | bc -l) )); then
+            best_throughput=$through_put
+            best_max_num_seqs=$max_num_seqs
+            best_num_batched_tokens=$max_num_batched_tokens
+            best_goodput=$goodput
+        fi
+    else
+        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}"
+        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}" >> "$RESULT"
+    fi
+    echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
+    # Stop the server before the next configuration is started.
+    echo "pkill vllm"
+    echo
+    pkill vllm
+    sleep 10
+    rm -f "$vllm_log"
+    # Print a separator row of 20 '=' characters.
+    printf '=%.0s' $(seq 1 20)
+    return 0
+}
+num_seqs_list="128 256"
+num_batched_tokens_list="512 1024 2048 4096"
+for num_seqs in $num_seqs_list; do
+    for num_batched_tokens in $num_batched_tokens_list; do
+        run_benchmark $num_seqs $num_batched_tokens
+        exit 0
+    done
+done
+echo "finish permutations"
+echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
+echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput" >> "$RESULT"
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -12,8 +12,7 @@ from typing import Optional, Union
 import aiohttp
 import huggingface_hub.constants
 from tqdm.asyncio import tqdm
-from transformers import (AutoTokenizer, PreTrainedTokenizer,
+from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
-                          PreTrainedTokenizerFast)
 # NOTE(simon): do not import vLLM here so the benchmark script
 # can run without vLLM installed.
@@ -43,8 +42,7 @@ class RequestFuncOutput:
    latency: float = 0.0
    output_tokens: int = 0
    ttft: float = 0.0  # Time to first token
-    itl: list[float] = field(
+    itl: list[float] = field(default_factory=list)  # list of inter-token latencies
-        default_factory=list)  # list of inter-token latencies
    tpot: float = 0.0  # avg next-token latencies
    prompt_len: int = 0
    error: str = ""
@@ -57,8 +55,9 @@ async def async_request_tgi(
    api_url = request_func_input.api_url
    assert api_url.endswith("generate_stream")
-    async with aiohttp.ClientSession(trust_env=True,
+    async with aiohttp.ClientSession(
-                                     timeout=AIOHTTP_TIMEOUT) as session:
+        trust_env=True, timeout=AIOHTTP_TIMEOUT
+    ) as session:
        params = {
            "max_new_tokens": request_func_input.output_len,
            "do_sample": True,
@@ -105,8 +104,7 @@ async def async_request_tgi(
                        # Decoding phase
                        else:
-                            output.itl.append(timestamp -
+                            output.itl.append(timestamp - most_recent_timestamp)
-                                              most_recent_timestamp)
                        most_recent_timestamp = timestamp
@@ -133,8 +131,9 @@ async def async_request_trt_llm(
    api_url = request_func_input.api_url
    assert api_url.endswith("generate_stream")
-    async with aiohttp.ClientSession(trust_env=True,
+    async with aiohttp.ClientSession(
-                                     timeout=AIOHTTP_TIMEOUT) as session:
+        trust_env=True, timeout=AIOHTTP_TIMEOUT
+    ) as session:
        payload = {
            "accumulate_tokens": True,
            "text_input": request_func_input.prompt,
@@ -159,8 +158,7 @@ async def async_request_trt_llm(
                        if not chunk_bytes:
                            continue
-                        chunk = chunk_bytes.decode("utf-8").removeprefix(
+                        chunk = chunk_bytes.decode("utf-8").removeprefix("data:")
-                            "data:")
                        data = json.loads(chunk)
                        output.generated_text += data["text_output"]
@@ -172,8 +170,7 @@ async def async_request_trt_llm(
                        # Decoding phase
                        else:
-                            output.itl.append(timestamp -
+                            output.itl.append(timestamp - most_recent_timestamp)
-                                              most_recent_timestamp)
                        most_recent_timestamp = timestamp
@@ -197,10 +194,11 @@ async def async_request_deepspeed_mii(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
 ) -> RequestFuncOutput:
-    async with aiohttp.ClientSession(trust_env=True,
+    async with aiohttp.ClientSession(
-                                     timeout=AIOHTTP_TIMEOUT) as session:
+        trust_env=True, timeout=AIOHTTP_TIMEOUT
+    ) as session:
        payload = {
+            "model": request_func_input.model,
            "prompt": request_func_input.prompt,
            "max_tokens": request_func_input.output_len,
            "temperature": 0.01,  # deepspeed-mii does not accept 0.0 temp.
@@ -216,19 +214,21 @@ async def async_request_deepspeed_mii(
        st = time.perf_counter()
        try:
-            async with session.post(url=request_func_input.api_url,
+            async with session.post(
-                                    json=payload) as response:
+                url=request_func_input.api_url, json=payload
+            ) as response:
                if response.status == 200:
                    parsed_resp = await response.json()
                    output.latency = time.perf_counter() - st
                    if "choices" in parsed_resp:
-                        output.generated_text = parsed_resp["choices"][0][
+                        output.generated_text = parsed_resp["choices"][0]["text"]
-                            "text"]
                    elif "text" in parsed_resp:
                        output.generated_text = parsed_resp["text"][0]
                    else:
-                        output.error = ("Unexpected response format: "
+                        output.error = (
-                                        "neither 'choices' nor 'text' found")
+                            "Unexpected response format: "
+                            "neither 'choices' nor 'text' found"
+                        )
                        output.success = False
                    output.success = True
                else:
@@ -249,17 +249,20 @@ async def async_request_openai_completions(
    pbar: Optional[tqdm] = None,
 ) -> RequestFuncOutput:
    api_url = request_func_input.api_url
-    assert api_url.endswith(
+    assert api_url.endswith(("completions", "profile")), (
-        ("completions", "profile")
+        "OpenAI Completions API URL must end with 'completions' or 'profile'."
-    ), "OpenAI Completions API URL must end with 'completions' or 'profile'."
+    )
-    async with aiohttp.ClientSession(trust_env=True,
+    async with aiohttp.ClientSession(
-                                     timeout=AIOHTTP_TIMEOUT) as session:
+        trust_env=True, timeout=AIOHTTP_TIMEOUT
+    ) as session:
        payload = {
-            "model": request_func_input.model_name \
+            "model": request_func_input.model_name
-                if request_func_input.model_name else request_func_input.model,
+            if request_func_input.model_name
+            else request_func_input.model,
            "prompt": request_func_input.prompt,
            "temperature": 0.0,
+            "repetition_penalty": 1.0,
            "max_tokens": request_func_input.output_len,
            "logprobs": request_func_input.logprobs,
            "stream": True,
@@ -271,9 +274,7 @@ async def async_request_openai_completions(
            payload["ignore_eos"] = request_func_input.ignore_eos
        if request_func_input.extra_body:
            payload.update(request_func_input.extra_body)
-        headers = {
+        headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
-            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
-        }
        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len
@@ -282,8 +283,9 @@ async def async_request_openai_completions(
        st = time.perf_counter()
        most_recent_timestamp = st
        try:
-            async with session.post(url=api_url, json=payload,
+            async with session.post(
-                                    headers=headers) as response:
+                url=api_url, json=payload, headers=headers
+            ) as response:
                if response.status == 200:
                    first_chunk_received = False
                    async for chunk_bytes in response.content:
@@ -291,8 +293,7 @@ async def async_request_openai_completions(
                        if not chunk_bytes:
                            continue
-                        chunk = chunk_bytes.decode("utf-8").removeprefix(
+                        chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
-                            "data: ")
                        if chunk != "[DONE]":
                            data = json.loads(chunk)
@@ -312,21 +313,20 @@ async def async_request_openai_completions(
                                # Decoding phase
                                else:
-                                    output.itl.append(timestamp -
+                                    output.itl.append(timestamp - most_recent_timestamp)
-                                                      most_recent_timestamp)
                                most_recent_timestamp = timestamp
                                generated_text += text or ""
                            elif usage := data.get("usage"):
-                                output.output_tokens = usage.get(
+                                output.output_tokens = usage.get("completion_tokens")
-                                    "completion_tokens")
                    if first_chunk_received:
                        output.success = True
                    else:
                        output.success = False
                        output.error = (
                            "Never received a valid chunk to calculate TTFT."
-                            "This response will be marked as failed!")
+                            "This response will be marked as failed!"
+                        )
                    output.generated_text = generated_text
                    output.latency = most_recent_timestamp - st
                else:
@@ -347,23 +347,22 @@ async def async_request_openai_chat_completions(
    pbar: Optional[tqdm] = None,
 ) -> RequestFuncOutput:
    api_url = request_func_input.api_url
-    assert api_url.endswith(
+    assert api_url.endswith(("chat/completions", "profile")), (
-        ("chat/completions", "profile")
+        "OpenAI Chat Completions API URL must end with 'chat/completions'."
-    ), "OpenAI Chat Completions API URL must end with 'chat/completions'."
+    )
-    async with aiohttp.ClientSession(trust_env=True,
+    async with aiohttp.ClientSession(
-                                     timeout=AIOHTTP_TIMEOUT) as session:
+        trust_env=True, timeout=AIOHTTP_TIMEOUT
+    ) as session:
        content = [{"type": "text", "text": request_func_input.prompt}]
        if request_func_input.multi_modal_content:
            content.append(request_func_input.multi_modal_content)
        payload = {
-            "model": request_func_input.model_name \
+            "model": request_func_input.model_name
-                if request_func_input.model_name else request_func_input.model,
+            if request_func_input.model_name
+            else request_func_input.model,
            "messages": [
-                {
+                {"role": "user", "content": content},
-                    "role": "user",
-                    "content": content
-                },
            ],
            "temperature": 0.0,
            "max_completion_tokens": request_func_input.output_len,
@@ -389,16 +388,16 @@ async def async_request_openai_chat_completions(
        st = time.perf_counter()
        most_recent_timestamp = st
        try:
-            async with session.post(url=api_url, json=payload,
+            async with session.post(
-                                    headers=headers) as response:
+                url=api_url, json=payload, headers=headers
+            ) as response:
                if response.status == 200:
                    async for chunk_bytes in response.content:
                        chunk_bytes = chunk_bytes.strip()
                        if not chunk_bytes:
                            continue
-                        chunk = chunk_bytes.decode("utf-8").removeprefix(
+                        chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
-                            "data: ")
                        if chunk != "[DONE]":
                            timestamp = time.perf_counter()
                            data = json.loads(chunk)
@@ -412,13 +411,11 @@ async def async_request_openai_chat_completions(
                                # Decoding phase
                                else:
-                                    output.itl.append(timestamp -
+                                    output.itl.append(timestamp - most_recent_timestamp)
-                                                      most_recent_timestamp)
                                generated_text += content or ""
                            elif usage := data.get("usage"):
-                                output.output_tokens = usage.get(
+                                output.output_tokens = usage.get("completion_tokens")
-                                    "completion_tokens")
                            most_recent_timestamp = timestamp
@@ -444,25 +441,28 @@ async def async_request_openai_audio(
 ) -> RequestFuncOutput:
    # Lazy import without PlaceholderModule to avoid vllm dep.
    import soundfile
    api_url = request_func_input.api_url
-    assert api_url.endswith(
+    assert api_url.endswith(("transcriptions", "translations")), (
-        ("transcriptions", "translations"
+        "OpenAI Chat Completions API URL must end with 'transcriptions' "
-         )), "OpenAI Chat Completions API URL must end with 'transcriptions' "
+    )
    "or `translations`."
-    async with aiohttp.ClientSession(trust_env=True,
+    async with aiohttp.ClientSession(
-                                     timeout=AIOHTTP_TIMEOUT) as session:
+        trust_env=True, timeout=AIOHTTP_TIMEOUT
+    ) as session:
        content = [{"type": "text", "text": request_func_input.prompt}]
        payload = {
-            "model": request_func_input.model_name \
+            "model": request_func_input.model_name
-                if request_func_input.model_name else request_func_input.model,
+            if request_func_input.model_name
+            else request_func_input.model,
            "temperature": 0.0,
            "max_completion_tokens": request_func_input.output_len,
            "stream": True,
            "language": "en",
            # Flattened due to multipart/form-data
            "stream_include_usage": True,
-            "stream_continuous_usage_stats": True
+            "stream_continuous_usage_stats": True,
        }
        if request_func_input.extra_body:
            payload.update(request_func_input.extra_body)
@@ -477,9 +477,9 @@ async def async_request_openai_audio(
            buffer.seek(0)
            return buffer
-        with to_bytes(*request_func_input.multi_modal_content['audio']) as f:
+        with to_bytes(*request_func_input.multi_modal_content["audio"]) as f:
            form = aiohttp.FormData()
-            form.add_field('file', f, content_type='audio/wav')
+            form.add_field("file", f, content_type="audio/wav")
            for key, value in payload.items():
                form.add_field(key, str(value))
@@ -491,24 +491,22 @@ async def async_request_openai_audio(
            st = time.perf_counter()
            most_recent_timestamp = st
            try:
-                async with session.post(url=api_url,
+                async with session.post(
-                                        data=form,
+                    url=api_url, data=form, headers=headers
-                                        headers=headers) as response:
+                ) as response:
                    if response.status == 200:
                        async for chunk_bytes in response.content:
                            chunk_bytes = chunk_bytes.strip()
                            if not chunk_bytes:
                                continue
-                            chunk = chunk_bytes.decode("utf-8").removeprefix(
+                            chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
-                                "data: ")
                            if chunk != "[DONE]":
                                timestamp = time.perf_counter()
                                data = json.loads(chunk)
                                if choices := data.get("choices"):
-                                    content = choices[0]["delta"].get(
+                                    content = choices[0]["delta"].get("content")
-                                        "content")
                                    # First token
                                    if ttft == 0.0:
                                        ttft = timestamp - st
@@ -517,12 +515,14 @@ async def async_request_openai_audio(
                                    # Decoding phase
                                    else:
                                        output.itl.append(
-                                            timestamp - most_recent_timestamp)
+                                            timestamp - most_recent_timestamp
+                                        )
                                    generated_text += content or ""
                                elif usage := data.get("usage"):
                                    output.output_tokens = usage.get(
-                                        "completion_tokens")
+                                        "completion_tokens"
+                                    )
                                most_recent_timestamp = timestamp
@@ -543,7 +543,7 @@ async def async_request_openai_audio(
 def get_model(pretrained_model_name_or_path: str) -> str:
-    if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
+    if os.getenv("VLLM_USE_MODELSCOPE", "False").lower() == "true":
        from modelscope import snapshot_download
        from vllm.model_executor.model_loader.weight_utils import get_lock
@@ -554,7 +554,8 @@ def get_model(pretrained_model_name_or_path: str) -> str:
            model_path = snapshot_download(
                model_id=pretrained_model_name_or_path,
                local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
-                ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])
+                ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"],
+            )
            return model_path
    return pretrained_model_name_or_path
@@ -567,23 +568,23 @@ def get_tokenizer(
    **kwargs,
 ) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
    if pretrained_model_name_or_path is not None and not os.path.exists(
-            pretrained_model_name_or_path):
+        pretrained_model_name_or_path
-        pretrained_model_name_or_path = get_model(
+    ):
-            pretrained_model_name_or_path)
+        pretrained_model_name_or_path = get_model(pretrained_model_name_or_path)
    if tokenizer_mode == "slow":
        if kwargs.get("use_fast", False):
-            raise ValueError(
+            raise ValueError("Cannot use the fast tokenizer in slow tokenizer mode.")
-                "Cannot use the fast tokenizer in slow tokenizer mode.")
        kwargs["use_fast"] = False
    if tokenizer_mode == "mistral":
        try:
            from vllm.transformers_utils.tokenizer import MistralTokenizer
        except ImportError as e:
-            raise ImportError("MistralTokenizer requires vllm package.\n"
+            raise ImportError(
-                              "Please install it with `pip install vllm` "
+                "MistralTokenizer requires vllm package.\n"
-                              "to use mistral tokenizer mode.") from e
+                "Please install it with `pip install vllm` "
-        return MistralTokenizer.from_pretrained(
+                "to use mistral tokenizer mode."
-            str(pretrained_model_name_or_path))
+            ) from e
+        return MistralTokenizer.from_pretrained(str(pretrained_model_name_or_path))
    else:
        return AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path,
@@ -606,7 +607,7 @@ ASYNC_REQUEST_FUNCS = {
 }
 OPENAI_COMPATIBLE_BACKENDS = [
-    k for k, v in ASYNC_REQUEST_FUNCS.items()
+    k
-    if v in (async_request_openai_completions,
+    for k, v in ASYNC_REQUEST_FUNCS.items()
-             async_request_openai_chat_completions)
+    if v in (async_request_openai_completions, async_request_openai_chat_completions)
 ]
--- a/benchmarks/benchmark_dataset.py
+++ b/benchmarks/benchmark_dataset.py
@@ -82,14 +82,12 @@ class BenchmarkDataset(ABC):
        self.dataset_path = dataset_path
        # Set the random seed, ensuring that a None value is replaced with the
        # default seed.
-        self.random_seed = (random_seed
+        self.random_seed = random_seed if random_seed is not None else self.DEFAULT_SEED
-                            if random_seed is not None else self.DEFAULT_SEED)
        self.data = None
    def apply_multimodal_chat_transformation(
-            self,
+        self, prompt: str, mm_content: Optional[MultiModalDataDict] = None
-            prompt: str,
+    ) -> list[dict]:
-            mm_content: Optional[MultiModalDataDict] = None) -> list[dict]:
        """
        Transform a prompt and optional multimodal content into a chat format.
        This method is used for chat models that expect a specific conversation
@@ -111,8 +109,7 @@ class BenchmarkDataset(ABC):
            NotImplementedError: If a subclass does not implement this method.
        """
        # TODO (jenniferzhao): add support for downloading data
-        raise NotImplementedError(
+        raise NotImplementedError("load_data must be implemented in subclasses.")
-            "load_data must be implemented in subclasses.")
    def get_random_lora_request(
        self,
@@ -158,8 +155,9 @@ class BenchmarkDataset(ABC):
        return lora_request, lora_tokenizer_cache[lora_id] or tokenizer
    @abstractmethod
-    def sample(self, tokenizer: PreTrainedTokenizerBase,
+    def sample(
-               num_requests: int) -> list[SampleRequest]:
+        self, tokenizer: PreTrainedTokenizerBase, num_requests: int
+    ) -> list[SampleRequest]:
        """
        Abstract method to generate sample requests from the dataset.
@@ -177,8 +175,9 @@ class BenchmarkDataset(ABC):
        """
        raise NotImplementedError("sample must be implemented in subclasses.")
-    def maybe_oversample_requests(self, requests: list[SampleRequest],
+    def maybe_oversample_requests(
-                                  num_requests: int) -> None:
+        self, requests: list[SampleRequest], num_requests: int
+    ) -> None:
        """
        Oversamples the list of requests if its size is less than the desired
        number.
@@ -189,11 +188,9 @@ class BenchmarkDataset(ABC):
        """
        if len(requests) < num_requests:
            random.seed(self.random_seed)
-            additional = random.choices(requests,
+            additional = random.choices(requests, k=num_requests - len(requests))
-                                        k=num_requests - len(requests))
            requests.extend(additional)
-            logger.info("Oversampled requests to reach %d total samples.",
+            logger.info("Oversampled requests to reach %d total samples.", num_requests)
-                        num_requests)
 # -----------------------------------------------------------------------------
@@ -218,14 +215,14 @@ def is_valid_sequence(
    """
    # Check for invalid conditions
    prompt_too_short = prompt_len < min_len
-    output_too_short = (not skip_min_output_len_check) and (output_len
+    output_too_short = (not skip_min_output_len_check) and (output_len < min_len)
-                                                            < min_len)
    prompt_too_long = prompt_len > max_prompt_len
    combined_too_long = (prompt_len + output_len) > max_total_len
    # Return True if none of the invalid conditions are met
-    return not (prompt_too_short or output_too_short or prompt_too_long
+    return not (
-                or combined_too_long)
+        prompt_too_short or output_too_short or prompt_too_long or combined_too_long
+    )
 @cache
@@ -257,28 +254,28 @@ def process_image(image: Any) -> Mapping[str, Any]:
    Raises:
        ValueError: If the input is not a supported type.
    """
-    if isinstance(image, dict) and 'bytes' in image:
+    if isinstance(image, dict) and "bytes" in image:
-        image = Image.open(BytesIO(image['bytes']))
+        image = Image.open(BytesIO(image["bytes"]))
    if isinstance(image, Image.Image):
        image = image.convert("RGB")
        with io.BytesIO() as image_data:
            image.save(image_data, format="JPEG")
-            image_base64 = base64.b64encode(
+            image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8")
-                image_data.getvalue()).decode("utf-8")
        return {
            "type": "image_url",
-            "image_url": {
+            "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"},
-                "url": f"data:image/jpeg;base64,{image_base64}"
-            },
        }
    if isinstance(image, str):
-        image_url = (image if image.startswith(
+        image_url = (
-            ("http://", "file://")) else f"file://{image}")
+            image if image.startswith(("http://", "file://")) else f"file://{image}"
+        )
        return {"type": "image_url", "image_url": {"url": image_url}}
-    raise ValueError(f"Invalid image input {image}. Must be a PIL.Image.Image"
+    raise ValueError(
-                     " or str or dictionary with raw image bytes.")
+        f"Invalid image input {image}. Must be a PIL.Image.Image"
+        " or str or dictionary with raw image bytes."
+    )
 # -----------------------------------------------------------------------------
@@ -315,42 +312,56 @@ class RandomDataset(BenchmarkDataset):
        )
        vocab_size = tokenizer.vocab_size
+        num_special_tokens = tokenizer.num_special_tokens_to_add()
+        real_input_len = input_len - num_special_tokens
-        prefix_token_ids = (np.random.randint(
+        prefix_token_ids = (
-            0, vocab_size, size=prefix_len).tolist() if prefix_len > 0 else [])
+            np.random.randint(0, vocab_size, size=prefix_len).tolist()
+            if prefix_len > 0
+            else []
+        )
        # New sampling logic: [X * (1 - b), X * (1 + b)]
-        input_low = int(input_len * (1 - range_ratio))
+        input_low = int(real_input_len * (1 - range_ratio))
-        input_high = int(input_len * (1 + range_ratio))
+        input_high = int(real_input_len * (1 + range_ratio))
        output_low = int(output_len * (1 - range_ratio))
        output_high = int(output_len * (1 + range_ratio))
        # Add logging for debugging
        logger.info("Sampling input_len from [%s, %s]", input_low, input_high)
-        logger.info("Sampling output_len from [%s, %s]", output_low,
+        logger.info("Sampling output_len from [%s, %s]", output_low, output_high)
-                    output_high)
+        input_lens = np.random.randint(input_low, input_high + 1, size=num_requests)
-        input_lens = np.random.randint(input_low,
+        output_lens = np.random.randint(output_low, output_high + 1, size=num_requests)
-                                       input_high + 1,
-                                       size=num_requests)
-        output_lens = np.random.randint(output_low,
-                                        output_high + 1,
-                                        size=num_requests)
        offsets = np.random.randint(0, vocab_size, size=num_requests)
        requests = []
        for i in range(num_requests):
-            inner_seq = ((offsets[i] + i + np.arange(input_lens[i])) %
+            inner_seq = (
-                         vocab_size).tolist()
+                (offsets[i] + i + np.arange(input_lens[i])) % vocab_size
+            ).tolist()
            token_sequence = prefix_token_ids + inner_seq
            prompt = tokenizer.decode(token_sequence)
+            # After decoding the prompt we have to encode and decode it again.
+            # This is done because in some cases N consecutive tokens
+            # give a string tokenized into != N number of tokens.
+            # For example for GPT2Tokenizer:
+            # [6880, 6881] -> ['Ġcalls', 'here'] ->
+            # [1650, 939, 486] -> ['Ġcall', 'sh', 'ere']
+            # To avoid uncontrolled change of the prompt length,
+            # the encoded sequence is truncated before being decoded again.
+            re_encoded_sequence = tokenizer.encode(prompt, add_special_tokens=False)[
+                : input_lens[i]
+            ]
+            prompt = tokenizer.decode(re_encoded_sequence)
            total_input_len = prefix_len + int(input_lens[i])
            requests.append(
                SampleRequest(
                    prompt=prompt,
                    prompt_len=total_input_len,
                    expected_output_len=int(output_lens[i]),
-                ))
+                )
+            )
        return requests
@@ -377,7 +388,8 @@ class ShareGPTDataset(BenchmarkDataset):
            self.data = json.load(f)
        # Filter entries with at least two conversation turns.
        self.data = [
-            entry for entry in self.data
+            entry
+            for entry in self.data
            if "conversations" in entry and len(entry["conversations"]) >= 2
        ]
        random.seed(self.random_seed)
@@ -403,27 +415,28 @@ class ShareGPTDataset(BenchmarkDataset):
            )
            lora_request, tokenizer = self.get_random_lora_request(
-                tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path)
+                tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path
+            )
            prompt_ids = tokenizer(prompt).input_ids
            completion_ids = tokenizer(completion).input_ids
            prompt_len = len(prompt_ids)
-            new_output_len = (len(completion_ids)
+            new_output_len = len(completion_ids) if output_len is None else output_len
-                              if output_len is None else output_len)
+            if not is_valid_sequence(
-            if not is_valid_sequence(prompt_len,
+                prompt_len,
-                                     new_output_len,
+                new_output_len,
-                                     skip_min_output_len_check=output_len
+                skip_min_output_len_check=output_len is not None,
-                                     is not None):
+            ):
                continue
            if enable_multimodal_chat:
-                prompt = self.apply_multimodal_chat_transformation(
+                prompt = self.apply_multimodal_chat_transformation(prompt, None)
-                    prompt, None)
            samples.append(
                SampleRequest(
                    prompt=prompt,
                    prompt_len=prompt_len,
                    expected_output_len=new_output_len,
                    lora_request=lora_request,
-                ))
+                )
+            )
        self.maybe_oversample_requests(samples, num_requests)
        return samples
@@ -469,20 +482,20 @@ class SonnetDataset(BenchmarkDataset):
    ) -> list:
        # Calculate average token length for a poem line.
        tokenized_lines = [tokenizer(line).input_ids for line in self.data]
-        avg_len = sum(len(tokens)
+        avg_len = sum(len(tokens) for tokens in tokenized_lines) / len(tokenized_lines)
-                      for tokens in tokenized_lines) / len(tokenized_lines)
        # Build the base prompt.
        base_prompt = "Pick as many lines as you can from these poem lines:\n"
        base_msg = [{"role": "user", "content": base_prompt}]
-        base_fmt = tokenizer.apply_chat_template(base_msg,
+        base_fmt = tokenizer.apply_chat_template(
-                                                 add_generation_prompt=True,
+            base_msg, add_generation_prompt=True, tokenize=False
-                                                 tokenize=False)
+        )
        base_offset = len(tokenizer(base_fmt).input_ids)
        if input_len <= base_offset:
            raise ValueError(
                f"'input_len' must be higher than the base prompt length "
-                f"({base_offset}).")
+                f"({base_offset})."
+            )
        # Determine how many poem lines to use.
        num_input_lines = round((input_len - base_offset) / avg_len)
@@ -491,21 +504,23 @@ class SonnetDataset(BenchmarkDataset):
        samples = []
        while len(samples) < num_requests:
-            extra_lines = random.choices(self.data,
+            extra_lines = random.choices(
-                                         k=num_input_lines - num_prefix_lines)
+                self.data, k=num_input_lines - num_prefix_lines
+            )
            prompt = f"{base_prompt}{''.join(prefix_lines + extra_lines)}"
            msg = [{"role": "user", "content": prompt}]
            prompt_formatted = tokenizer.apply_chat_template(
-                msg, add_generation_prompt=True, tokenize=False)
+                msg, add_generation_prompt=True, tokenize=False
+            )
            prompt_len = len(tokenizer(prompt_formatted).input_ids)
            if prompt_len <= input_len:
                samples.append(
                    SampleRequest(
-                        prompt=prompt_formatted
+                        prompt=prompt_formatted if return_prompt_formatted else prompt,
-                        if return_prompt_formatted else prompt,
                        prompt_len=prompt_len,
                        expected_output_len=output_len,
-                    ))
+                    )
+                )
        return samples
@@ -525,7 +540,9 @@ class BurstGPTDataset(BenchmarkDataset):
        super().__init__(**kwargs)
        self.load_data()
-    def load_data(self, ):
+    def load_data(
+        self,
+    ):
        if self.dataset_path is None:
            raise ValueError("dataset_path must be provided for loading data.")
@@ -539,8 +556,7 @@ class BurstGPTDataset(BenchmarkDataset):
    def _sample_loaded_data(self, num_requests: int) -> list:
        if num_requests <= len(self.data):
-            data = self.data.sample(n=num_requests,
+            data = self.data.sample(n=num_requests, random_state=self.random_seed)
-                                    random_state=self.random_seed)
        else:
            data = self.data.sample(
                n=num_requests,
@@ -564,7 +580,8 @@ class BurstGPTDataset(BenchmarkDataset):
            input_len = int(data[i][2])
            output_len = int(data[i][3])
            lora_req, tokenizer = self.get_random_lora_request(
-                tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path)
+                tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path
+            )
            vocab_size = tokenizer.vocab_size
            # Generate a synthetic prompt: a list of token IDs computed as (i +
            # j) modulo vocab_size.
@@ -576,7 +593,8 @@ class BurstGPTDataset(BenchmarkDataset):
                    prompt_len=input_len,
                    expected_output_len=output_len,
                    lora_request=lora_req,
-                ))
+                )
+            )
        return samples
@@ -619,20 +637,23 @@ class HuggingFaceDataset(BenchmarkDataset):
 class ConversationDataset(HuggingFaceDataset):
    """Dataset for conversation data with multimodal support."""
    SUPPORTED_DATASET_PATHS = {
-        'lmms-lab/LLaVA-OneVision-Data', 'Aeala/ShareGPT_Vicuna_unfiltered'
+        "lmms-lab/LLaVA-OneVision-Data",
+        "Aeala/ShareGPT_Vicuna_unfiltered",
    }
    IS_MULTIMODAL = True
-    def sample(self,
+    def sample(
-               tokenizer: PreTrainedTokenizerBase,
+        self,
-               num_requests: int,
+        tokenizer: PreTrainedTokenizerBase,
-               output_len: Optional[int] = None,
+        num_requests: int,
-               enable_multimodal_chat: bool = False,
+        output_len: Optional[int] = None,
-               **kwargs) -> list:
+        enable_multimodal_chat: bool = False,
+        **kwargs,
+    ) -> list:
        # Filter examples with at least 2 conversations
-        filtered_data = self.data.filter(
+        filtered_data = self.data.filter(lambda x: len(x["conversations"]) >= 2)
-            lambda x: len(x["conversations"]) >= 2)
        sampled_requests = []
        dynamic_output = output_len is None
@@ -648,24 +669,22 @@ class ConversationDataset(HuggingFaceDataset):
            completion_len = len(completion_ids)
            output_len = completion_len if dynamic_output else output_len
            assert isinstance(output_len, int) and output_len > 0
-            if dynamic_output and not is_valid_sequence(
+            if dynamic_output and not is_valid_sequence(prompt_len, completion_len):
-                    prompt_len, completion_len):
                continue
-            mm_content = process_image(
+            mm_content = process_image(item["image"]) if "image" in item else None
-                item["image"]) if "image" in item else None
            if enable_multimodal_chat:
                # Note: when chat is enabled the request prompt_len is no longer
                # accurate and we will be using request output to count the
                # actual prompt len and output len
-                prompt = self.apply_multimodal_chat_transformation(
+                prompt = self.apply_multimodal_chat_transformation(prompt, mm_content)
-                    prompt, mm_content)
            sampled_requests.append(
                SampleRequest(
                    prompt=prompt,
                    prompt_len=prompt_len,
                    expected_output_len=output_len,
                    multi_modal_data=mm_content,
-                ))
+                )
+            )
        self.maybe_oversample_requests(sampled_requests, num_requests)
        return sampled_requests
@@ -682,10 +701,8 @@ class VisionArenaDataset(HuggingFaceDataset):
    DEFAULT_OUTPUT_LEN = 128
    SUPPORTED_DATASET_PATHS = {
-        "lmarena-ai/VisionArena-Chat":
+        "lmarena-ai/VisionArena-Chat": lambda x: x["conversation"][0][0]["content"],
-        lambda x: x["conversation"][0][0]["content"],
+        "lmarena-ai/vision-arena-bench-v0.1": lambda x: x["turns"][0][0]["content"],
-        "lmarena-ai/vision-arena-bench-v0.1":
-        lambda x: x["turns"][0][0]["content"]
    }
    IS_MULTIMODAL = True
@@ -697,16 +714,14 @@ class VisionArenaDataset(HuggingFaceDataset):
        enable_multimodal_chat: bool = False,
        **kwargs,
    ) -> list:
-        output_len = (output_len
+        output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN
-                      if output_len is not None else self.DEFAULT_OUTPUT_LEN)
        sampled_requests = []
        for item in self.data:
            if len(sampled_requests) >= num_requests:
                break
            parser_fn = self.SUPPORTED_DATASET_PATHS.get(self.dataset_path)
            if parser_fn is None:
-                raise ValueError(
+                raise ValueError(f"Unsupported dataset path: {self.dataset_path}")
-                    f"Unsupported dataset path: {self.dataset_path}")
            prompt = parser_fn(item)
            mm_content = process_image(item["images"][0])
            prompt_len = len(tokenizer(prompt).input_ids)
@@ -714,15 +729,15 @@ class VisionArenaDataset(HuggingFaceDataset):
                # Note: when chat is enabled the request prompt_len is no longer
                # accurate and we will be using request output to count the
                # actual prompt len
-                prompt = self.apply_multimodal_chat_transformation(
+                prompt = self.apply_multimodal_chat_transformation(prompt, mm_content)
-                    prompt, mm_content)
            sampled_requests.append(
                SampleRequest(
                    prompt=prompt,
                    prompt_len=prompt_len,
                    expected_output_len=output_len,
                    multi_modal_data=mm_content,
-                ))
+                )
+            )
        self.maybe_oversample_requests(sampled_requests, num_requests)
        return sampled_requests
@@ -747,14 +762,15 @@ class InstructCoderDataset(HuggingFaceDataset):
        "likaixin/InstructCoder",
    }
-    def sample(self,
+    def sample(
-               tokenizer: PreTrainedTokenizerBase,
+        self,
-               num_requests: int,
+        tokenizer: PreTrainedTokenizerBase,
-               output_len: Optional[int] = None,
+        num_requests: int,
-               enable_multimodal_chat: bool = False,
+        output_len: Optional[int] = None,
-               **kwargs) -> list:
+        enable_multimodal_chat: bool = False,
-        output_len = (output_len
+        **kwargs,
-                      if output_len is not None else self.DEFAULT_OUTPUT_LEN)
+    ) -> list:
+        output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN
        sampled_requests = []
        for item in self.data:
            if len(sampled_requests) >= num_requests:
@@ -766,7 +782,63 @@ class InstructCoderDataset(HuggingFaceDataset):
                    prompt=prompt,
                    prompt_len=prompt_len,
                    expected_output_len=output_len,
-                ))
+                )
+            )
+        self.maybe_oversample_requests(sampled_requests, num_requests)
+        return sampled_requests
+# -----------------------------------------------------------------------------
+# MT-Bench Dataset Implementation
+# -----------------------------------------------------------------------------
+class MTBenchDataset(HuggingFaceDataset):
+    """
+    MT-Bench Dataset.
+    https://huggingface.co/datasets/philschmid/mt-bench
+    We create a single-turn dataset for MT-Bench.
+    This is similar to the speculative decoding benchmark setup in vLLM
+    https://github.com/vllm-project/vllm/blob/9d98ab5ec/examples/offline_inference/eagle.py#L14-L18
+    """  # noqa: E501
+    DEFAULT_OUTPUT_LEN = 256  # avg len used in SD bench in vLLM
+    SUPPORTED_DATASET_PATHS = {
+        "philschmid/mt-bench",
+    }
+    def sample(
+        self,
+        tokenizer: PreTrainedTokenizerBase,
+        num_requests: int,
+        output_len: Optional[int] = None,
+        enable_multimodal_chat: bool = False,
+        **kwargs,
+    ) -> list:
+        output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN
+        sampled_requests = []
+        for item in self.data:
+            if len(sampled_requests) >= num_requests:
+                break
+            prompt = item["turns"][0]
+            # apply template
+            prompt = tokenizer.apply_chat_template(
+                [{"role": "user", "content": prompt}],
+                add_generation_prompt=True,
+                tokenize=False,
+            )
+            prompt_len = len(tokenizer(prompt).input_ids)
+            sampled_requests.append(
+                SampleRequest(
+                    prompt=prompt,
+                    prompt_len=prompt_len,
+                    expected_output_len=output_len,
+                )
+            )
        self.maybe_oversample_requests(sampled_requests, num_requests)
        return sampled_requests
@@ -780,23 +852,27 @@ class AIMODataset(HuggingFaceDataset):
    """
    Dataset class for processing an AIMO dataset with reasoning questions.
    """
    SUPPORTED_DATASET_PATHS = {
-        "AI-MO/aimo-validation-aime", "AI-MO/NuminaMath-1.5",
+        "AI-MO/aimo-validation-aime",
-        "AI-MO/NuminaMath-CoT"
+        "AI-MO/NuminaMath-1.5",
+        "AI-MO/NuminaMath-CoT",
    }
-    def sample(self,
+    def sample(
-               tokenizer: PreTrainedTokenizerBase,
+        self,
-               num_requests: int,
+        tokenizer: PreTrainedTokenizerBase,
-               output_len: Optional[int] = None,
+        num_requests: int,
-               **kwargs) -> list:
+        output_len: Optional[int] = None,
+        **kwargs,
+    ) -> list:
        sampled_requests = []
        dynamic_output = output_len is None
        for item in self.data:
            if len(sampled_requests) >= num_requests:
                break
-            prompt, completion = item['problem'], item["solution"]
+            prompt, completion = item["problem"], item["solution"]
            prompt_ids = tokenizer(prompt).input_ids
            completion_ids = tokenizer(completion).input_ids
@@ -804,10 +880,9 @@ class AIMODataset(HuggingFaceDataset):
            completion_len = len(completion_ids)
            output_len = completion_len if dynamic_output else output_len
            assert isinstance(output_len, int) and output_len > 0
-            if dynamic_output and not is_valid_sequence(prompt_len,
+            if dynamic_output and not is_valid_sequence(
-                                                        completion_len,
+                prompt_len, completion_len, max_prompt_len=2048, max_total_len=32000
-                                                        max_prompt_len=2048,
+            ):
-                                                        max_total_len=32000):
                continue
            sampled_requests.append(
                SampleRequest(
@@ -815,11 +890,100 @@ class AIMODataset(HuggingFaceDataset):
                    prompt_len=prompt_len,
                    expected_output_len=output_len,
                    multi_modal_data=None,
-                ))
+                )
+            )
        self.maybe_oversample_requests(sampled_requests, num_requests)
        return sampled_requests
+# -----------------------------------------------------------------------------
+# Next Edit Prediction Dataset Implementation
+# -----------------------------------------------------------------------------
+zeta_prompt = """### Instruction:
+You are a code completion assistant and your task is to analyze user edits and then rewrite an excerpt that the user provides, suggesting the appropriate edits within the excerpt, taking into account the cursor location.
+### User Edits:
+{}
+### User Excerpt:
+{}
+### Response:
+"""  # noqa: E501
+def _format_zeta_prompt(
+    sample: dict, original_start_marker: str = "<|editable_region_start|>"
+) -> dict:
+    """Format the zeta prompt for the Next Edit Prediction (NEP) dataset.
+    This function formats examples from the NEP dataset
+    into prompts and expected outputs. It could be
+    further extended to support more NEP datasets.
+    Args:
+        sample: The dataset sample containing events,
+            inputs, and outputs.
+        original_start_marker: The marker indicating the
+            start of the editable region. Defaults to
+            "<|editable_region_start|>".
+    Returns:
+        A dictionary with the formatted prompts and expected outputs.
+    """
+    events = sample["events"]
+    input = sample["input"]
+    output = sample["output"]
+    prompt = zeta_prompt.format(events, input)
+    # following the original implementation, extract the focused region
+    # from the raw output
+    output_start_index = output.find(original_start_marker)
+    output_focused_region = output[output_start_index:]
+    expected_output = output_focused_region
+    return {"prompt": prompt, "expected_output": expected_output}
+class NextEditPredictionDataset(HuggingFaceDataset):
+    """
+    Dataset class for processing a Next Edit Prediction dataset.
+    """
+    SUPPORTED_DATASET_PATHS = {
+        "zed-industries/zeta",
+    }
+    MAPPING_PROMPT_FUNCS = {
+        "zed-industries/zeta": _format_zeta_prompt,
+    }
+    def sample(self, tokenizer: PreTrainedTokenizerBase, num_requests: int, **kwargs):
+        formatting_prompt_func = self.MAPPING_PROMPT_FUNCS.get(self.dataset_path)
+        if formatting_prompt_func is None:
+            raise ValueError(f"Unsupported dataset path: {self.dataset_path}")
+        samples = []
+        for sample in self.data:
+            sample = formatting_prompt_func(sample)
+            samples.append(
+                SampleRequest(
+                    prompt=sample["prompt"],
+                    prompt_len=len(tokenizer(sample["prompt"]).input_ids),
+                    expected_output_len=len(
+                        tokenizer(sample["expected_output"]).input_ids
+                    ),
+                )
+            )
+            if len(samples) >= num_requests:
+                break
+        self.maybe_oversample_requests(samples, num_requests)
+        return samples
 # -----------------------------------------------------------------------------
 # ASR Dataset Implementation
 # -----------------------------------------------------------------------------
@@ -842,18 +1006,22 @@ class ASRDataset(HuggingFaceDataset):
    | AMI            | Meetings                               | Spontaneous              | ihm, sdm                    |
    +----------------+----------------------------------------+--------------------------+-----------------------------+
-    """ # noqa: E501
+    """  # noqa: E501
    SUPPORTED_DATASET_PATHS = {
-        "openslr/librispeech_asr", "facebook/voxpopuli", "LIUM/tedlium",
+        "openslr/librispeech_asr",
-        "edinburghcstr/ami", "speechcolab/gigaspeech", "kensho/spgispeech"
+        "facebook/voxpopuli",
+        "LIUM/tedlium",
+        "edinburghcstr/ami",
+        "speechcolab/gigaspeech",
+        "kensho/spgispeech",
    }
    DEFAULT_OUTPUT_LEN = 128
    IS_MULTIMODAL = True
    # TODO Whisper-specific. Abstract interface when more models are supported.
-    TRANSCRIPTION_PREAMBLE = "<|startoftranscript|><|en|><|transcribe|>"\
+    TRANSCRIPTION_PREAMBLE = "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>"
-                              "<|notimestamps|>"
    skip_long_audios: bool = True
    def sample(
@@ -864,8 +1032,8 @@ class ASRDataset(HuggingFaceDataset):
        **kwargs,
    ) -> list:
        import librosa
-        output_len = (output_len
-                      if output_len is not None else self.DEFAULT_OUTPUT_LEN)
+        output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN
        prompt = ASRDataset.TRANSCRIPTION_PREAMBLE
        prompt_len = len(tokenizer(prompt).input_ids)
        sampled_requests = []
@@ -888,10 +1056,14 @@ class ASRDataset(HuggingFaceDataset):
                    prompt_len=prompt_len,
                    expected_output_len=output_len,
                    multi_modal_data=mm_content,
-                ))
+                )
+            )
        if skipped:
-            logger.warning("%d samples discarded from dataset due to" \
+            logger.warning(
-                           " their length being greater than" \
+                "%d samples discarded from dataset due to"
-                           " what Whisper supports.", skipped)
+                " their length being greater than"
+                " what Whisper supports.",
+                skipped,
+            )
        self.maybe_oversample_requests(sampled_requests, num_requests)
        return sampled_requests
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -11,9 +11,9 @@ from typing import Any, Optional
 import numpy as np
 import torch
-from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
 from tqdm import tqdm
+from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
 from vllm import LLM, SamplingParams
 from vllm.engine.arg_utils import EngineArgs
 from vllm.inputs import PromptType
@@ -21,13 +21,14 @@ from vllm.sampling_params import BeamSearchParams
 from vllm.utils import FlexibleArgumentParser
-def save_to_pytorch_benchmark_format(args: argparse.Namespace,
+def save_to_pytorch_benchmark_format(
-                                     results: dict[str, Any]) -> None:
+    args: argparse.Namespace, results: dict[str, Any]
+) -> None:
    pt_records = convert_to_pytorch_benchmark_format(
        args=args,
        metrics={"latency": results["latencies"]},
-        extra_info={k: results[k]
+        extra_info={k: results[k] for k in ["avg_latency", "percentiles"]},
-                    for k in ["avg_latency", "percentiles"]})
+    )
    if pt_records:
        pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
        write_to_json(pt_file, pt_records)
@@ -42,9 +43,11 @@ def main(args: argparse.Namespace):
    # the engine will automatically process the request in multiple batches.
    llm = LLM(**dataclasses.asdict(engine_args))
    assert llm.llm_engine.model_config.max_model_len >= (
-        args.input_len +
+        args.input_len + args.output_len
-        args.output_len), ("Please ensure that max_model_len is greater than"
+    ), (
-                           " the sum of input_len and output_len.")
+        "Please ensure that max_model_len is greater than"
+        " the sum of input_len and output_len."
+    )
    sampling_params = SamplingParams(
        n=args.n,
@@ -55,18 +58,16 @@ def main(args: argparse.Namespace):
        detokenize=not args.disable_detokenize,
    )
    print(sampling_params)
-    dummy_prompt_token_ids = np.random.randint(10000,
+    dummy_prompt_token_ids = np.random.randint(
-                                               size=(args.batch_size,
+        10000, size=(args.batch_size, args.input_len)
-                                                     args.input_len))
+    )
-    dummy_prompts: list[PromptType] = [{
+    dummy_prompts: list[PromptType] = [
-        "prompt_token_ids": batch
+        {"prompt_token_ids": batch} for batch in dummy_prompt_token_ids.tolist()
-    } for batch in dummy_prompt_token_ids.tolist()]
+    ]
    def llm_generate():
        if not args.use_beam_search:
-            llm.generate(dummy_prompts,
+            llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False)
-                         sampling_params=sampling_params,
-                         use_tqdm=False)
        else:
            llm.beam_search(
                dummy_prompts,
@@ -80,12 +81,13 @@ def main(args: argparse.Namespace):
    def run_to_completion(profile_dir: Optional[str] = None):
        if profile_dir:
            with torch.profiler.profile(
-                    activities=[
+                activities=[
-                        torch.profiler.ProfilerActivity.CPU,
+                    torch.profiler.ProfilerActivity.CPU,
-                        torch.profiler.ProfilerActivity.CUDA,
+                    torch.profiler.ProfilerActivity.CUDA,
-                    ],
+                ],
-                    on_trace_ready=torch.profiler.tensorboard_trace_handler(
+                on_trace_ready=torch.profiler.tensorboard_trace_handler(
-                        str(profile_dir)),
+                    str(profile_dir)
+                ),
            ) as p:
                llm_generate()
            print(p.key_averages().table(sort_by="self_cuda_time_total"))
@@ -103,8 +105,9 @@ def main(args: argparse.Namespace):
    if args.profile:
        profile_dir = args.profile_result_dir
        if not profile_dir:
-            profile_dir = (Path(".") / "vllm_benchmark_result" /
+            profile_dir = (
-                           f"latency_result_{time.time()}")
+                Path(".") / "vllm_benchmark_result" / f"latency_result_{time.time()}"
+            )
        print(f"Profiling (results will be saved to '{profile_dir}')...")
        run_to_completion(profile_dir=profile_dir)
        return
@@ -135,7 +138,8 @@ def main(args: argparse.Namespace):
 if __name__ == "__main__":
    parser = FlexibleArgumentParser(
        description="Benchmark the latency of processing a single batch of "
-        "requests till completion.")
+        "requests till completion."
+    )
    parser.add_argument("--input-len", type=int, default=32)
    parser.add_argument("--output-len", type=int, default=128)
    parser.add_argument("--batch-size", type=int, default=8)
@@ -152,10 +156,9 @@ if __name__ == "__main__":
        default=10,
        help="Number of iterations to run for warmup.",
    )
-    parser.add_argument("--num-iters",
+    parser.add_argument(
-                        type=int,
+        "--num-iters", type=int, default=30, help="Number of iterations to run."
-                        default=30,
+    )
-                        help="Number of iterations to run.")
    parser.add_argument(
        "--profile",
        action="store_true",
@@ -165,8 +168,10 @@ if __name__ == "__main__":
        "--profile-result-dir",
        type=str,
        default=None,
-        help=("path to save the pytorch profiler output. Can be visualized "
+        help=(
-              "with ui.perfetto.dev or Tensorboard."),
+            "path to save the pytorch profiler output. Can be visualized "
+            "with ui.perfetto.dev or Tensorboard."
+        ),
    )
    parser.add_argument(
        "--output-json",
@@ -177,8 +182,10 @@ if __name__ == "__main__":
    parser.add_argument(
        "--disable-detokenize",
        action="store_true",
-        help=("Do not detokenize responses (i.e. do not include "
+        help=(
-              "detokenization time in the latency measurement)"),
+            "Do not detokenize responses (i.e. do not include "
+            "detokenization time in the latency measurement)"
+        ),
    )
    parser = EngineArgs.add_cli_args(parser)

--- a/benchmarks/benchmark_long_document_qa_throughput.py
+++ b/benchmarks/benchmark_long_document_qa_throughput.py
@@ -76,7 +76,7 @@ def repeat_prompts(prompts, repeat_count, mode: str):
            - 'random': Shuffle the prompts randomly after repetition.
            - 'tile': Repeat the entire prompt list in sequence.
              Example: [1, 2, 3] -> [1, 2, 3, 1, 2, 3].
-            - 'interleave': Repeat each prompt consecutively before moving to 
+            - 'interleave': Repeat each prompt consecutively before moving to
              the next. Example: [1, 2, 3] -> [1, 1, 2, 2, 3, 3].
    Returns:
@@ -86,20 +86,21 @@ def repeat_prompts(prompts, repeat_count, mode: str):
        ValueError: If an invalid mode is provided.
    """
    print("Repeat mode: ", mode)
-    if mode == 'random':
+    if mode == "random":
        repeated_prompts = prompts * repeat_count
        random.shuffle(repeated_prompts)
        return repeated_prompts
-    elif mode == 'tile':
+    elif mode == "tile":
        return prompts * repeat_count
-    elif mode == 'interleave':
+    elif mode == "interleave":
        repeated_prompts = []
        for prompt in prompts:
            repeated_prompts.extend([prompt] * repeat_count)
        return repeated_prompts
    else:
-        raise ValueError(f"Invalid mode: {mode}, only support "
+        raise ValueError(
-                         "'random', 'tile', 'interleave'")
+            f"Invalid mode: {mode}, only support 'random', 'tile', 'interleave'"
+        )
 def main(args):
@@ -109,16 +110,16 @@ def main(args):
    # we append the document id at the beginning to avoid any of the document
    # being the prefix of other documents
    prompts = [
-        str(i) + ' '.join(['hi'] * args.document_length)
+        str(i) + " ".join(["hi"] * args.document_length)
        for i in range(args.num_documents)
    ]
    prompts = repeat_prompts(prompts, args.repeat_count, mode=args.repeat_mode)
    warmup_prompts = [
-        "This is warm up request " + str(i) + \
+        "This is warm up request " + str(i) + " ".join(["hi"] * args.document_length)
-                ' '.join(['hi'] * args.document_length)
+        for i in range(args.num_documents)
-        for i in range(args.num_documents)]
+    ]
    # Create the LLM engine
    engine_args = EngineArgs.from_cli_args(args)
@@ -142,42 +143,52 @@ def main(args):
 if __name__ == "__main__":
    parser = FlexibleArgumentParser(
-        description=
+        description="Benchmark the performance with or "
-        'Benchmark the performance with or without automatic prefix caching.')
+        "without automatic prefix caching."
+    )
    parser.add_argument(
-        '--document-length',
+        "--document-length",
        type=int,
        # Roughly the number of tokens for a system paper,
        # excluding images
        default=20000,
-        help='Range of input lengths for sampling prompts,'
+        help="Range of input lengths for sampling prompts, "
-        'specified as "min:max" (e.g., "128:256").')
+        'specified as "min:max" (e.g., "128:256").',
+    )
-    parser.add_argument('--num-documents',
-                        type=int,
+    parser.add_argument(
-                        default=8,
+        "--num-documents",
-                        help='Range of input lengths for sampling prompts,'
+        type=int,
-                        'specified as "min:max" (e.g., "128:256").')
+        default=8,
+        help="Range of input lengths for sampling prompts, "
-    parser.add_argument('--output-len', type=int, default=10)
+        'specified as "min:max" (e.g., "128:256").',
+    )
-    parser.add_argument('--repeat-count',
-                        type=int,
+    parser.add_argument("--output-len", type=int, default=10)
-                        default=2,
-                        help='Number of times to repeat each prompt')
+    parser.add_argument(
+        "--repeat-count",
-    parser.add_argument("--repeat-mode",
+        type=int,
-                        type=str,
+        default=2,
-                        default='random',
+        help="Number of times to repeat each prompt",
-                        help='The mode to repeat prompts. The supported '
+    )
-                        'modes are "random", "tile", and "interleave". '
-                        'See repeat_prompts() in the source code for details.')
+    parser.add_argument(
+        "--repeat-mode",
-    parser.add_argument("--shuffle-seed",
+        type=str,
-                        type=int,
+        default="random",
-                        default=0,
+        help="The mode to repeat prompts. The supported "
-                        help='Random seed when the repeat mode is "random"')
+        'modes are "random", "tile", and "interleave". '
+        "See repeat_prompts() in the source code for details.",
+    )
+    parser.add_argument(
+        "--shuffle-seed",
+        type=int,
+        default=0,
+        help='Random seed when the repeat mode is "random"',
+    )
    parser = EngineArgs.add_cli_args(parser)
    args = parser.parse_args()

--- a/benchmarks/benchmark_prefix_caching.py
+++ b/benchmarks/benchmark_prefix_caching.py
@@ -63,8 +63,7 @@ class Request:
    output_len: int
-def sample_tokens(tokenizer: PreTrainedTokenizerBase,
+def sample_tokens(tokenizer: PreTrainedTokenizerBase, length: int) -> list[int]:
-                  length: int) -> list[int]:
    vocab = tokenizer.get_vocab()
    all_special_ids = set(tokenizer.all_special_ids)
@@ -91,8 +90,10 @@ def sample_requests_from_dataset(
    # Filter out the conversations with less than 2 turns.
    dataset = [data for data in dataset if len(data["conversations"]) >= 2]
    # Only keep the first two turns of each conversation.
-    dataset = [(data["conversations"][0]["value"],
+    dataset = [
-                data["conversations"][1]["value"]) for data in dataset]
+        (data["conversations"][0]["value"], data["conversations"][1]["value"])
+        for data in dataset
+    ]
    # Shuffle the dataset.
    random.shuffle(dataset)
@@ -113,8 +114,9 @@ def sample_requests_from_dataset(
        completion = dataset[i][1]
        completion_token_ids = tokenizer(completion).input_ids
        prompt_len = len(prompt_token_ids)
-        output_len = (len(completion_token_ids)
+        output_len = (
-                      if fixed_output_len is None else fixed_output_len)
+            len(completion_token_ids) if fixed_output_len is None else fixed_output_len
+        )
        if min_len <= prompt_len <= max_len:
            filtered_requests.append(Request(prompt, prompt_len, output_len))
@@ -128,27 +130,27 @@ def sample_requests_from_random(
    fixed_output_len: Optional[int],
    prefix_len: int,
 ) -> list[Request]:
    requests = []
    prefix_token_ids = sample_tokens(tokenizer, prefix_len)
    min_len, max_len = input_length_range
    for i in range(num_requests):
        unique_part_token_ids = sample_tokens(
-            tokenizer,
+            tokenizer, random.randint(min_len - prefix_len, max_len - prefix_len)
-            random.randint(min_len - prefix_len, max_len - prefix_len))
+        )
        prompt_token_ids = prefix_token_ids + unique_part_token_ids
        prompt = tokenizer.decode(prompt_token_ids)
        prompt_len = len(prompt_token_ids)
-        assert (min_len <= prompt_len <= max_len
+        assert min_len <= prompt_len <= max_len, (
-                ), f"prompt_len {prompt_len} out of range {min_len}:{max_len}"
+            f"prompt_len {prompt_len} out of range {min_len}:{max_len}"
+        )
        requests.append(Request(prompt, prompt_len, fixed_output_len))
    return requests
-def repeat_and_sort_requests(requests: list[Request],
+def repeat_and_sort_requests(
-                             repeat_count: int,
+    requests: list[Request], repeat_count: int, sort: bool = False
-                             sort: bool = False) -> list[str]:
+) -> list[str]:
    repeated_requests = requests * repeat_count
    if sort:
        repeated_requests.sort(key=lambda x: x[1])
@@ -159,14 +161,14 @@ def repeat_and_sort_requests(requests: list[Request],
 def main(args):
    tokenizer = get_tokenizer(args.model, trust_remote_code=True)
-    input_length_range = tuple(map(int, args.input_length_range.split(':')))
+    input_length_range = tuple(map(int, args.input_length_range.split(":")))
    random.seed(args.seed)
    if args.dataset_path is not None:
        if args.prefix_len > 0:
-            raise ValueError("prefix-len is not supported when "
+            raise ValueError(
-                             "dataset-path is provided.")
+                "prefix-len is not supported when dataset-path is provided."
-        print(f"Start to sample {args.num_prompts} prompts "
+            )
-              f"from {args.dataset_path}")
+        print(f"Start to sample {args.num_prompts} prompts from {args.dataset_path}")
        filtered_requests = sample_requests_from_dataset(
            dataset_path=args.dataset_path,
            num_requests=args.num_prompts,
@@ -196,14 +198,16 @@ def main(args):
    llm = LLM(**dataclasses.asdict(engine_args))
-    sampling_params = SamplingParams(temperature=0,
+    sampling_params = SamplingParams(
-                                     max_tokens=args.output_len,
+        temperature=0,
-                                     detokenize=not args.disable_detokenize)
+        max_tokens=args.output_len,
+        detokenize=not args.disable_detokenize,
+    )
    print("Testing filtered requests")
-    prompts = repeat_and_sort_requests(filtered_requests,
+    prompts = repeat_and_sort_requests(
-                                       repeat_count=args.repeat_count,
+        filtered_requests, repeat_count=args.repeat_count, sort=args.sort
-                                       sort=args.sort)
+    )
    print("------start generating------")
    test_prefix(
@@ -215,29 +219,35 @@ def main(args):
 if __name__ == "__main__":
    parser = FlexibleArgumentParser(
-        description=
+        description="Benchmark the performance with or without "
-        'Benchmark the performance with or without automatic prefix caching.')
+        "automatic prefix caching."
-    parser.add_argument("--dataset-path",
+    )
-                        type=str,
+    parser.add_argument(
-                        default=None,
+        "--dataset-path", type=str, default=None, help="Path to the dataset."
-                        help="Path to the dataset.")
+    )
-    parser.add_argument('--output-len', type=int, default=10)
+    parser.add_argument("--output-len", type=int, default=10)
-    parser.add_argument('--num-prompts',
+    parser.add_argument(
-                        type=int,
+        "--num-prompts",
-                        required=True,
+        type=int,
-                        help="Number of the prompts sampled from dataset")
+        required=True,
-    parser.add_argument('--repeat-count',
+        help="Number of the prompts sampled from dataset",
-                        type=int,
+    )
-                        default=1,
+    parser.add_argument(
-                        help='Number of times to repeat each prompt')
+        "--repeat-count",
-    parser.add_argument('--sort',
+        type=int,
-                        action='store_true',
+        default=1,
-                        help='Sort prompts by input length')
+        help="Number of times to repeat each prompt",
-    parser.add_argument('--input-length-range',
+    )
-                        type=str,
+    parser.add_argument(
-                        required=True,
+        "--sort", action="store_true", help="Sort prompts by input length"
-                        help='Range of input lengths for sampling prompts,'
+    )
-                        'specified as "min:max" (e.g., "128:256").')
+    parser.add_argument(
+        "--input-length-range",
+        type=str,
+        required=True,
+        help="Range of input lengths for sampling prompts,"
+        'specified as "min:max" (e.g., "128:256").',
+    )
    parser.add_argument(
        "--prefix-len",
        type=int,
@@ -248,10 +258,12 @@ if __name__ == "__main__":
        "when dataset-path is not provided.",
    )
    parser.add_argument(
-        '--disable-detokenize',
+        "--disable-detokenize",
-        action='store_true',
+        action="store_true",
-        help=("Do not detokenize responses (i.e. do not include "
+        help=(
-              "detokenization time in the latency measurement)"),
+            "Do not detokenize responses (i.e. do not include "
+            "detokenization time in the latency measurement)"
+        ),
    )
    parser = EngineArgs.add_cli_args(parser)

--- a/benchmarks/benchmark_prioritization.py
+++ b/benchmarks/benchmark_prioritization.py
 # SPDX-License-Identifier: Apache-2.0
 """Benchmark offline prioritization."""
 import argparse
 import dataclasses
 import json
@@ -13,7 +14,7 @@ from vllm.engine.arg_utils import EngineArgs
 from vllm.utils import FlexibleArgumentParser
-#Select a equi-probable random priority
+# Select a equi-probable random priority
 def get_random_flag():
    return 0 if random.random() < 0.5 else 1
@@ -33,8 +34,10 @@ def sample_requests(
    # Filter out the conversations with less than 2 turns.
    dataset = [data for data in dataset if len(data["conversations"]) >= 2]
    # Only keep the first two turns of each conversation.
-    dataset = [(data["conversations"][0]["value"],
+    dataset = [
-                data["conversations"][1]["value"]) for data in dataset]
+        (data["conversations"][0]["value"], data["conversations"][1]["value"])
+        for data in dataset
+    ]
    # Shuffle the dataset.
    random.shuffle(dataset)
@@ -51,8 +54,9 @@ def sample_requests(
        completion = dataset[i][1]
        completion_token_ids = tokenizer(completion).input_ids
        prompt_len = len(prompt_token_ids)
-        output_len = len(completion_token_ids
+        output_len = (
-                         ) if fixed_output_len is None else fixed_output_len
+            len(completion_token_ids) if fixed_output_len is None else fixed_output_len
+        )
        if prompt_len < 4 or output_len < 4:
            # Prune too short sequences.
            continue
@@ -74,13 +78,16 @@ def run_vllm(
    disable_detokenize: bool = False,
 ) -> float:
    from vllm import LLM, SamplingParams
    llm = LLM(**dataclasses.asdict(engine_args))
    assert all(
        llm.llm_engine.model_config.max_model_len >= (request[1] + request[2])
-        for request in requests), (
+        for request in requests
-            "Please ensure that max_model_len is greater than the sum of"
+    ), (
-            " input_len and output_len for all requests.")
+        "Please ensure that max_model_len is greater than the sum of"
+        " input_len and output_len for all requests."
+    )
    # Add the requests to the engine.
    prompts = []
@@ -97,7 +104,8 @@ def run_vllm(
                ignore_eos=True,
                max_tokens=output_len,
                detokenize=not disable_detokenize,
-            ))
+            )
+        )
    start = time.perf_counter()
    llm.generate(prompts, sampling_params, priority=priority, use_tqdm=True)
@@ -111,26 +119,33 @@ def main(args: argparse.Namespace):
    # Sample the requests.
    tokenizer = AutoTokenizer.from_pretrained(
-        args.tokenizer, trust_remote_code=args.trust_remote_code)
+        args.tokenizer, trust_remote_code=args.trust_remote_code
+    )
    if args.dataset is None:
        # Synthesize a prompt with the given input length.
        prompt = "hi" * (args.input_len - 1)
-        requests = [(prompt, args.input_len, args.output_len,
+        requests = [
-                     get_random_flag()) for _ in range(args.num_prompts)]
+            (prompt, args.input_len, args.output_len, get_random_flag())
+            for _ in range(args.num_prompts)
+        ]
    else:
-        requests = sample_requests(args.dataset, args.num_prompts, tokenizer,
+        requests = sample_requests(
-                                   args.output_len)
+            args.dataset, args.num_prompts, tokenizer, args.output_len
+        )
    if args.backend == "vllm":
-        elapsed_time = run_vllm(requests, args.n,
+        elapsed_time = run_vllm(
-                                EngineArgs.from_cli_args(args),
+            requests, args.n, EngineArgs.from_cli_args(args), args.disable_detokenize
-                                args.disable_detokenize)
+        )
    else:
        raise ValueError(f"Unknown backend: {args.backend}")
-    total_num_tokens = sum(prompt_len + output_len
+    total_num_tokens = sum(
-                           for _, prompt_len, output_len, priority in requests)
+        prompt_len + output_len for _, prompt_len, output_len, priority in requests
-    print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
+    )
-          f"{total_num_tokens / elapsed_time:.2f} tokens/s")
+    print(
+        f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
+        f"{total_num_tokens / elapsed_time:.2f} tokens/s"
+    )
    # Output JSON results if specified
    if args.output_json:
@@ -147,41 +162,44 @@ def main(args: argparse.Namespace):
 if __name__ == "__main__":
    parser = FlexibleArgumentParser(description="Benchmark the throughput.")
-    parser.add_argument("--backend",
-                        type=str,
-                        choices=["vllm", "hf", "mii"],
-                        default="vllm")
-    parser.add_argument("--dataset",
-                        type=str,
-                        default=None,
-                        help="Path to the dataset.")
-    parser.add_argument("--input-len",
-                        type=int,
-                        default=None,
-                        help="Input prompt length for each request")
-    parser.add_argument("--output-len",
-                        type=int,
-                        default=None,
-                        help="Output length for each request. Overrides the "
-                        "output length from the dataset.")
-    parser.add_argument("--n",
-                        type=int,
-                        default=1,
-                        help="Number of generated sequences per prompt.")
-    parser.add_argument("--num-prompts",
-                        type=int,
-                        default=200,
-                        help="Number of prompts to process.")
    parser.add_argument(
-        '--output-json',
+        "--backend", type=str, choices=["vllm", "hf", "mii"], default="vllm"
+    )
+    parser.add_argument(
+        "--dataset", type=str, default=None, help="Path to the dataset."
+    )
+    parser.add_argument(
+        "--input-len",
+        type=int,
+        default=None,
+        help="Input prompt length for each request",
+    )
+    parser.add_argument(
+        "--output-len",
+        type=int,
+        default=None,
+        help="Output length for each request. Overrides the "
+        "output length from the dataset.",
+    )
+    parser.add_argument(
+        "--n", type=int, default=1, help="Number of generated sequences per prompt."
+    )
+    parser.add_argument(
+        "--num-prompts", type=int, default=200, help="Number of prompts to process."
+    )
+    parser.add_argument(
+        "--output-json",
        type=str,
        default=None,
-        help='Path to save the throughput results in JSON format.')
+        help="Path to save the throughput results in JSON format.",
+    )
    parser.add_argument(
-        '--disable-detokenize',
+        "--disable-detokenize",
-        action='store_true',
+        action="store_true",
-        help=("Do not detokenize responses (i.e. do not include "
+        help=(
-              "detokenization time in the latency measurement)"),
+            "Do not detokenize responses (i.e. do not include "
+            "detokenization time in the latency measurement)"
+        ),
    )
    parser = EngineArgs.add_cli_args(parser)

--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -20,6 +20,7 @@ On the client side, run:
        --endpoint /generate_stream
    to the end of the command above.
 """
 import argparse
 import asyncio
 import gc
@@ -34,12 +35,16 @@ from datetime import datetime
 from typing import Any, Optional
 import numpy as np
-from backend_request_func import (ASYNC_REQUEST_FUNCS,
-                                  OPENAI_COMPATIBLE_BACKENDS, RequestFuncInput,
-                                  RequestFuncOutput)
 from tqdm.asyncio import tqdm
 from transformers import PreTrainedTokenizerBase
+from backend_request_func import (
+    ASYNC_REQUEST_FUNCS,
+    OPENAI_COMPATIBLE_BACKENDS,
+    RequestFuncInput,
+    RequestFuncOutput,
+)
 try:
    from vllm.transformers_utils.tokenizer import get_tokenizer
 except ImportError:
@@ -50,11 +55,21 @@ try:
 except ImportError:
    from argparse import ArgumentParser as FlexibleArgumentParser
-from benchmark_dataset import (AIMODataset, ASRDataset, BurstGPTDataset,
+from benchmark_dataset import (
-                               ConversationDataset, HuggingFaceDataset,
+    AIMODataset,
-                               InstructCoderDataset, RandomDataset,
+    ASRDataset,
-                               SampleRequest, ShareGPTDataset, SonnetDataset,
+    BurstGPTDataset,
-                               VisionArenaDataset)
+    ConversationDataset,
+    HuggingFaceDataset,
+    InstructCoderDataset,
+    MTBenchDataset,
+    NextEditPredictionDataset,
+    RandomDataset,
+    SampleRequest,
+    ShareGPTDataset,
+    SonnetDataset,
+    VisionArenaDataset,
+)
 from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
 MILLISECONDS_TO_SECONDS_CONVERSION = 1000
@@ -117,7 +132,8 @@ async def get_request(
    # Calculate scale parameter theta to maintain the desired request_rate.
    assert burstiness > 0, (
-        f"A positive burstiness factor is expected, but given {burstiness}.")
+        f"A positive burstiness factor is expected, but given {burstiness}."
+    )
    theta = 1.0 / (request_rate * burstiness)
    for request in input_requests:
@@ -163,8 +179,10 @@ def calculate_metrics(
                # bundled together
                # Note : this may inflate the output token count slightly
                output_len = len(
-                    tokenizer(outputs[i].generated_text,
+                    tokenizer(
-                              add_special_tokens=False).input_ids)
+                        outputs[i].generated_text, add_special_tokens=False
+                    ).input_ids
+                )
            actual_output_lens.append(output_len)
            total_input += input_requests[i].prompt_len
            tpot = 0
@@ -187,16 +205,19 @@ def calculate_metrics(
        if "ttft" in goodput_config_dict:
            valid_metrics.append(ttfts)
-            slo_values.append(goodput_config_dict["ttft"] /
+            slo_values.append(
-                              MILLISECONDS_TO_SECONDS_CONVERSION)
+                goodput_config_dict["ttft"] / MILLISECONDS_TO_SECONDS_CONVERSION
+            )
        if "tpot" in goodput_config_dict:
            valid_metrics.append(all_tpots)
-            slo_values.append(goodput_config_dict["tpot"] /
+            slo_values.append(
-                              MILLISECONDS_TO_SECONDS_CONVERSION)
+                goodput_config_dict["tpot"] / MILLISECONDS_TO_SECONDS_CONVERSION
+            )
        if "e2el" in goodput_config_dict:
            valid_metrics.append(e2els)
-            slo_values.append(goodput_config_dict["e2el"] /
+            slo_values.append(
-                              MILLISECONDS_TO_SECONDS_CONVERSION)
+                goodput_config_dict["e2el"] / MILLISECONDS_TO_SECONDS_CONVERSION
+            )
        for req_metric in zip(*valid_metrics):
            is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)])
@@ -207,7 +228,8 @@ def calculate_metrics(
        warnings.warn(
            "All requests failed. This is likely due to a misconfiguration "
            "on the benchmark arguments.",
-            stacklevel=2)
+            stacklevel=2,
+        )
    metrics = BenchmarkMetrics(
        completed=completed,
        total_input=total_input,
@@ -216,27 +238,31 @@ def calculate_metrics(
        request_goodput=good_completed / dur_s,
        output_throughput=sum(actual_output_lens) / dur_s,
        total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s,
-        mean_ttft_ms=np.mean(ttfts or 0) *
+        mean_ttft_ms=np.mean(ttfts or 0)
-        1000,  # ttfts is empty if streaming is not supported by backend
+        * 1000,  # ttfts is empty if streaming is not supported by backend
        std_ttft_ms=np.std(ttfts or 0) * 1000,
        median_ttft_ms=np.median(ttfts or 0) * 1000,
-        percentiles_ttft_ms=[(p, np.percentile(ttfts or 0, p) * 1000)
+        percentiles_ttft_ms=[
-                             for p in selected_percentiles],
+            (p, np.percentile(ttfts or 0, p) * 1000) for p in selected_percentiles
+        ],
        mean_tpot_ms=np.mean(tpots or 0) * 1000,
        std_tpot_ms=np.std(tpots or 0) * 1000,
        median_tpot_ms=np.median(tpots or 0) * 1000,
-        percentiles_tpot_ms=[(p, np.percentile(tpots or 0, p) * 1000)
+        percentiles_tpot_ms=[
-                             for p in selected_percentiles],
+            (p, np.percentile(tpots or 0, p) * 1000) for p in selected_percentiles
+        ],
        mean_itl_ms=np.mean(itls or 0) * 1000,
        std_itl_ms=np.std(itls or 0) * 1000,
        median_itl_ms=np.median(itls or 0) * 1000,
-        percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000)
+        percentiles_itl_ms=[
-                            for p in selected_percentiles],
+            (p, np.percentile(itls or 0, p) * 1000) for p in selected_percentiles
+        ],
        mean_e2el_ms=np.mean(e2els or 0) * 1000,
        std_e2el_ms=np.std(e2els or 0) * 1000,
        median_e2el_ms=np.median(e2els or 0) * 1000,
-        percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000)
+        percentiles_e2el_ms=[
-                             for p in selected_percentiles],
+            (p, np.percentile(e2els or 0, p) * 1000) for p in selected_percentiles
+        ],
    )
    return metrics, actual_output_lens
@@ -269,10 +295,12 @@ async def benchmark(
        raise ValueError(f"Unknown backend: {backend}")
    print("Starting initial single prompt test run...")
-    test_prompt, test_prompt_len, test_output_len, test_mm_content = \
+    test_prompt, test_prompt_len, test_output_len, test_mm_content = (
-        input_requests[0].prompt, input_requests[0].prompt_len, \
+        input_requests[0].prompt,
-        input_requests[0].expected_output_len, \
+        input_requests[0].prompt_len,
-            input_requests[0].multi_modal_data
+        input_requests[0].expected_output_len,
+        input_requests[0].multi_modal_data,
+    )
    assert test_mm_content is None or isinstance(test_mm_content, dict)
    test_input = RequestFuncInput(
@@ -292,36 +320,36 @@ async def benchmark(
    if not test_output.success:
        raise ValueError(
            "Initial test run failed - Please make sure benchmark arguments "
-            f"are correctly specified. Error: {test_output.error}")
+            f"are correctly specified. Error: {test_output.error}"
+        )
    else:
        print("Initial test run completed. Starting main benchmark run...")
    if lora_modules:
        # For each input request, choose a LoRA module at random.
        lora_modules = iter(
-            [random.choice(lora_modules) \
+            [random.choice(lora_modules) for _ in range(len(input_requests))]
-                for _ in range(len(input_requests))])
+        )
    if profile:
        print("Starting profiler...")
-        profile_input = RequestFuncInput(model=model_id,
+        profile_input = RequestFuncInput(
-                                         model_name=model_name,
+            model=model_id,
-                                         prompt=test_prompt,
+            model_name=model_name,
-                                         api_url=base_url + "/start_profile",
+            prompt=test_prompt,
-                                         prompt_len=test_prompt_len,
+            api_url=base_url + "/start_profile",
-                                         output_len=test_output_len,
+            prompt_len=test_prompt_len,
-                                         logprobs=logprobs,
+            output_len=test_output_len,
-                                         multi_modal_content=test_mm_content,
+            logprobs=logprobs,
-                                         ignore_eos=ignore_eos,
+            multi_modal_content=test_mm_content,
-                                         extra_body=extra_body)
+            ignore_eos=ignore_eos,
+            extra_body=extra_body,
+        )
        profile_output = await request_func(request_func_input=profile_input)
        if profile_output.success:
            print("Profiler started")
-    if burstiness == 1.0:
+    distribution = "Poisson process" if burstiness == 1.0 else "Gamma distribution"
-        distribution = "Poisson process"
-    else:
-        distribution = "Gamma distribution"
    print(f"Traffic request rate: {request_rate}")
    print(f"Burstiness factor: {burstiness} ({distribution})")
@@ -333,42 +361,45 @@ async def benchmark(
    # and it will simplify the code in limited_request_func.
    #    semaphore = (asyncio.Semaphore(max_concurrency)
    #                 if max_concurrency else contextlib.nullcontext())
-    semaphore = (asyncio.Semaphore(max_concurrency)
+    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
-                 if max_concurrency else None)
    async def limited_request_func(request_func_input, pbar):
        if semaphore is None:
-            return await request_func(request_func_input=request_func_input,
+            return await request_func(request_func_input=request_func_input, pbar=pbar)
-                                      pbar=pbar)
        async with semaphore:
-            return await request_func(request_func_input=request_func_input,
+            return await request_func(request_func_input=request_func_input, pbar=pbar)
-                                      pbar=pbar)
    benchmark_start_time = time.perf_counter()
    tasks: list[asyncio.Task] = []
    async for request in get_request(input_requests, request_rate, burstiness):
-        prompt, prompt_len, output_len, mm_content = request.prompt, \
+        prompt, prompt_len, output_len, mm_content = (
-            request.prompt_len, request.expected_output_len, \
+            request.prompt,
-                request.multi_modal_data
+            request.prompt_len,
+            request.expected_output_len,
+            request.multi_modal_data,
+        )
        req_model_id, req_model_name = model_id, model_name
        if lora_modules:
            req_lora_module = next(lora_modules)
            req_model_id, req_model_name = req_lora_module, req_lora_module
-        request_func_input = RequestFuncInput(model=req_model_id,
+        request_func_input = RequestFuncInput(
-                                              model_name=req_model_name,
+            model=req_model_id,
-                                              prompt=prompt,
+            model_name=req_model_name,
-                                              api_url=api_url,
+            prompt=prompt,
-                                              prompt_len=prompt_len,
+            api_url=api_url,
-                                              output_len=output_len,
+            prompt_len=prompt_len,
-                                              logprobs=logprobs,
+            output_len=output_len,
-                                              multi_modal_content=mm_content,
+            logprobs=logprobs,
-                                              ignore_eos=ignore_eos,
+            multi_modal_content=mm_content,
-                                              extra_body=extra_body)
+            ignore_eos=ignore_eos,
+            extra_body=extra_body,
+        )
        tasks.append(
            asyncio.create_task(
-                limited_request_func(request_func_input=request_func_input,
+                limited_request_func(request_func_input=request_func_input, pbar=pbar)
-                                     pbar=pbar)))
+            )
+        )
    outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)
    if profile:
@@ -400,22 +431,32 @@ async def benchmark(
        goodput_config_dict=goodput_config_dict,
    )
-    print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='='))
+    print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
    print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
-    print("{:<40} {:<10.2f}".format("Benchmark duration (s):",
+    print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
-                                    benchmark_duration))
    print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
-    print("{:<40} {:<10}".format("Total generated tokens:",
+    print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output))
-                                 metrics.total_output))
+    print(
-    print("{:<40} {:<10.2f}".format("Request throughput (req/s):",
+        "{:<40} {:<10.2f}".format(
-                                    metrics.request_throughput))
+            "Request throughput (req/s):", metrics.request_throughput
+        )
+    )
    if goodput_config_dict:
-        print("{:<40} {:<10.2f}".format("Request goodput (req/s):",
+        print(
-                                        metrics.request_goodput))
+            "{:<40} {:<10.2f}".format(
-    print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):",
+                "Request goodput (req/s):", metrics.request_goodput
-                                    metrics.output_throughput))
+            )
-    print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):",
+        )
-                                    metrics.total_token_throughput))
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Output token throughput (tok/s):", metrics.output_throughput
+        )
+    )
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Total Token throughput (tok/s):", metrics.total_token_throughput
+        )
+    )
    result = {
        "duration": benchmark_duration,
@@ -423,8 +464,7 @@ async def benchmark(
        "total_input_tokens": metrics.total_input,
        "total_output_tokens": metrics.total_output,
        "request_throughput": metrics.request_throughput,
-        "request_goodput:":
+        "request_goodput:": metrics.request_goodput if goodput_config_dict else None,
-        metrics.request_goodput if goodput_config_dict else None,
        "output_throughput": metrics.output_throughput,
        "total_token_throughput": metrics.total_token_throughput,
        "input_lens": [output.prompt_len for output in outputs],
@@ -447,29 +487,35 @@ async def benchmark(
        # metric.
        if metric_attribute_name not in selected_percentile_metrics:
            return
-        print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-'))
+        print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-"))
-        print("{:<40} {:<10.2f}".format(
+        print(
-            f"Mean {metric_name} (ms):",
+            "{:<40} {:<10.2f}".format(
-            getattr(metrics, f"mean_{metric_attribute_name}_ms")))
+                f"Mean {metric_name} (ms):",
-        print("{:<40} {:<10.2f}".format(
+                getattr(metrics, f"mean_{metric_attribute_name}_ms"),
-            f"Median {metric_name} (ms):",
+            )
-            getattr(metrics, f"median_{metric_attribute_name}_ms")))
+        )
+        print(
+            "{:<40} {:<10.2f}".format(
+                f"Median {metric_name} (ms):",
+                getattr(metrics, f"median_{metric_attribute_name}_ms"),
+            )
+        )
        result[f"mean_{metric_attribute_name}_ms"] = getattr(
-            metrics, f"mean_{metric_attribute_name}_ms")
+            metrics, f"mean_{metric_attribute_name}_ms"
+        )
        result[f"median_{metric_attribute_name}_ms"] = getattr(
-            metrics, f"median_{metric_attribute_name}_ms")
+            metrics, f"median_{metric_attribute_name}_ms"
+        )
        result[f"std_{metric_attribute_name}_ms"] = getattr(
-            metrics, f"std_{metric_attribute_name}_ms")
+            metrics, f"std_{metric_attribute_name}_ms"
-        for p, value in getattr(metrics,
+        )
-                                f"percentiles_{metric_attribute_name}_ms"):
+        for p, value in getattr(metrics, f"percentiles_{metric_attribute_name}_ms"):
            p_word = str(int(p)) if int(p) == p else str(p)
-            print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):",
+            print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", value))
-                                            value))
            result[f"p{p_word}_{metric_attribute_name}_ms"] = value
    process_one_metric("ttft", "TTFT", "Time to First Token")
-    process_one_metric("tpot", "TPOT",
+    process_one_metric("tpot", "TPOT", "Time per Output Token (excl. 1st token)")
-                       "Time per Output Token (excl. 1st token)")
    process_one_metric("itl", "ITL", "Inter-token Latency")
    process_one_metric("e2el", "E2EL", "End-to-end Latency")
@@ -489,12 +535,14 @@ def check_goodput_args(args):
                raise ValueError(
                    f"Invalid metric name found, {slo_name}: {slo_val}. "
                    "The service level objective name should be one of "
-                    f"{str(VALID_NAMES)}. ")
+                    f"{str(VALID_NAMES)}. "
+                )
            if slo_val < 0:
                raise ValueError(
                    f"Invalid value found, {slo_name}: {slo_val}. "
                    "The service level objective value should be "
-                    "non-negative.")
+                    "non-negative."
+                )
    return goodput_config_dict
@@ -507,31 +555,42 @@ def parse_goodput(slo_pairs):
    except ValueError as err:
        raise argparse.ArgumentTypeError(
            "Invalid format found for service level objectives. "
-            "Specify service level objectives for goodput as \"KEY:VALUE\" "
+            'Specify service level objectives for goodput as "KEY:VALUE" '
            "pairs, where the key is a metric name, and the value is a "
-            "number in milliseconds.") from err
+            "number in milliseconds."
+        ) from err
    return goodput_config_dict
-def save_to_pytorch_benchmark_format(args: argparse.Namespace,
+def save_to_pytorch_benchmark_format(
-                                     results: dict[str, Any],
+    args: argparse.Namespace, results: dict[str, Any], file_name: str
-                                     file_name: str) -> None:
+) -> None:
    metrics = [
-        "median_ttft_ms", "mean_ttft_ms", "std_ttft_ms", "p99_ttft_ms",
+        "median_ttft_ms",
-        "mean_tpot_ms", "median_tpot_ms", "std_tpot_ms", "p99_tpot_ms",
+        "mean_ttft_ms",
-        "median_itl_ms", "mean_itl_ms", "std_itl_ms", "p99_itl_ms"
+        "std_ttft_ms",
+        "p99_ttft_ms",
+        "mean_tpot_ms",
+        "median_tpot_ms",
+        "std_tpot_ms",
+        "p99_tpot_ms",
+        "median_itl_ms",
+        "mean_itl_ms",
+        "std_itl_ms",
+        "p99_itl_ms",
    ]
    # These raw data might be useful, but they are rather big. They can be added
    # later if needed
    ignored_metrics = ["ttfts", "itls", "generated_texts", "errors"]
    pt_records = convert_to_pytorch_benchmark_format(
        args=args,
-        metrics={k: [results[k]]
+        metrics={k: [results[k]] for k in metrics},
-                 for k in metrics},
        extra_info={
            k: results[k]
-            for k in results if k not in metrics and k not in ignored_metrics
+            for k in results
-        })
+            if k not in metrics and k not in ignored_metrics
+        },
+    )
    if pt_records:
        # Don't use json suffix here as we don't want CI to pick it up
        pt_file = f"{os.path.splitext(file_name)[0]}.pytorch.json"
@@ -556,34 +615,42 @@ def main(args: argparse.Namespace):
        api_url = f"http://{args.host}:{args.port}{args.endpoint}"
        base_url = f"http://{args.host}:{args.port}"
-    tokenizer = get_tokenizer(tokenizer_id,
+    tokenizer = get_tokenizer(
-                              tokenizer_mode=tokenizer_mode,
+        tokenizer_id,
-                              trust_remote_code=args.trust_remote_code)
+        tokenizer_mode=tokenizer_mode,
+        trust_remote_code=args.trust_remote_code,
+    )
    if args.dataset_name is None:
        raise ValueError(
            "Please specify '--dataset-name' and the corresponding "
-            "'--dataset-path' if required.")
+            "'--dataset-path' if required."
+        )
    if args.dataset_name == "sonnet":
        dataset = SonnetDataset(dataset_path=args.dataset_path)
        # For the "sonnet" dataset, formatting depends on the backend.
        if args.backend == "openai-chat":
-            input_requests = dataset.sample(num_requests=args.num_prompts,
+            input_requests = dataset.sample(
-                                            input_len=args.sonnet_input_len,
+                num_requests=args.num_prompts,
-                                            output_len=args.sonnet_output_len,
+                input_len=args.sonnet_input_len,
-                                            prefix_len=args.sonnet_prefix_len,
+                output_len=args.sonnet_output_len,
-                                            tokenizer=tokenizer,
+                prefix_len=args.sonnet_prefix_len,
-                                            return_prompt_formatted=False)
+                tokenizer=tokenizer,
+                return_prompt_formatted=False,
+            )
        else:
            assert tokenizer.chat_template or tokenizer.default_chat_template, (
-                "Tokenizer/model must have chat template for sonnet dataset.")
+                "Tokenizer/model must have chat template for sonnet dataset."
-            input_requests = dataset.sample(num_requests=args.num_prompts,
+            )
-                                            input_len=args.sonnet_input_len,
+            input_requests = dataset.sample(
-                                            output_len=args.sonnet_output_len,
+                num_requests=args.num_prompts,
-                                            prefix_len=args.sonnet_prefix_len,
+                input_len=args.sonnet_input_len,
-                                            tokenizer=tokenizer,
+                output_len=args.sonnet_output_len,
-                                            return_prompt_formatted=True)
+                prefix_len=args.sonnet_prefix_len,
+                tokenizer=tokenizer,
+                return_prompt_formatted=True,
+            )
    elif args.dataset_name == "hf":
        # all following datasets are implemented from the
@@ -595,32 +662,45 @@ def main(args: argparse.Namespace):
        elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
            dataset_class = InstructCoderDataset
            args.hf_split = "train"
+        elif args.dataset_path in MTBenchDataset.SUPPORTED_DATASET_PATHS:
+            dataset_class = MTBenchDataset
+            args.hf_split = "train"
        elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
            dataset_class = ConversationDataset
        elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS:
            dataset_class = AIMODataset
            args.hf_split = "train"
+        elif args.dataset_path in NextEditPredictionDataset.SUPPORTED_DATASET_PATHS:  # noqa: E501
+            dataset_class = NextEditPredictionDataset
+            args.hf_split = "train"
        elif args.dataset_path in ASRDataset.SUPPORTED_DATASET_PATHS:
            dataset_class = ASRDataset
            args.hf_split = "train"
        else:
-            supported_datasets = set([
+            supported_datasets = set(
-                dataset_name for cls in HuggingFaceDataset.__subclasses__()
+                [
-                for dataset_name in cls.SUPPORTED_DATASET_PATHS
+                    dataset_name
-            ])
+                    for cls in HuggingFaceDataset.__subclasses__()
+                    for dataset_name in cls.SUPPORTED_DATASET_PATHS
+                ]
+            )
            raise ValueError(
                f"Unsupported dataset path: {args.dataset_path}. "
                "Huggingface dataset only supports dataset_path"
                f" from one of following: {supported_datasets}. "
                "Please consider contributing if you would "
-                "like to add support for additional dataset formats.")
+                "like to add support for additional dataset formats."
+            )
-        if (dataset_class.IS_MULTIMODAL and backend not in \
+        if dataset_class.IS_MULTIMODAL and backend not in [
-            ["openai-chat", "openai-audio"]):
+            "openai-chat",
+            "openai-audio",
+        ]:
            # multi-modal benchmark is only available on OpenAI Chat backend.
            raise ValueError(
-                "Multi-modal content is only supported on 'openai-chat' and " \
+                "Multi-modal content is only supported on 'openai-chat' and "
-                "'openai-audio' backend.")
+                "'openai-audio' backend."
+            )
        input_requests = dataset_class(
            dataset_path=args.dataset_path,
            dataset_subset=args.hf_subset,
@@ -635,26 +715,24 @@ def main(args: argparse.Namespace):
    else:
        # For datasets that follow a similar structure, use a mapping.
        dataset_mapping = {
-            "sharegpt":
+            "sharegpt": lambda: ShareGPTDataset(
-            lambda: ShareGPTDataset(random_seed=args.seed,
+                random_seed=args.seed, dataset_path=args.dataset_path
-                                    dataset_path=args.dataset_path).sample(
+            ).sample(
-                                        tokenizer=tokenizer,
+                tokenizer=tokenizer,
-                                        num_requests=args.num_prompts,
+                num_requests=args.num_prompts,
-                                        output_len=args.sharegpt_output_len,
+                output_len=args.sharegpt_output_len,
-                                    ),
+            ),
-            "burstgpt":
+            "burstgpt": lambda: BurstGPTDataset(
-            lambda: BurstGPTDataset(random_seed=args.seed,
+                random_seed=args.seed, dataset_path=args.dataset_path
-                                    dataset_path=args.dataset_path).
+            ).sample(tokenizer=tokenizer, num_requests=args.num_prompts),
-            sample(tokenizer=tokenizer, num_requests=args.num_prompts),
+            "random": lambda: RandomDataset(dataset_path=args.dataset_path).sample(
-            "random":
-            lambda: RandomDataset(dataset_path=args.dataset_path).sample(
                tokenizer=tokenizer,
                num_requests=args.num_prompts,
                prefix_len=args.random_prefix_len,
                input_len=args.random_input_len,
                output_len=args.random_output_len,
                range_ratio=args.random_range_ratio,
-            )
+            ),
        }
        try:
@@ -670,15 +748,16 @@ def main(args: argparse.Namespace):
            "top_p": args.top_p,
            "top_k": args.top_k,
            "min_p": args.min_p,
-            "temperature": args.temperature
+            "temperature": args.temperature,
-        }.items() if v is not None
+        }.items()
+        if v is not None
    }
    # Sampling parameters are only supported by openai-compatible backend.
    if sampling_params and args.backend not in OPENAI_COMPATIBLE_BACKENDS:
        raise ValueError(
-            "Sampling parameters are only supported by openai-compatible "
+            "Sampling parameters are only supported by openai-compatible backends."
-            "backends.")
+        )
    if "temperature" not in sampling_params:
        sampling_params["temperature"] = 0.0  # Default to greedy decoding.
@@ -702,15 +781,14 @@ def main(args: argparse.Namespace):
            disable_tqdm=args.disable_tqdm,
            profile=args.profile,
            selected_percentile_metrics=args.percentile_metrics.split(","),
-            selected_percentiles=[
+            selected_percentiles=[float(p) for p in args.metric_percentiles.split(",")],
-                float(p) for p in args.metric_percentiles.split(",")
-            ],
            ignore_eos=args.ignore_eos,
            goodput_config_dict=goodput_config_dict,
            max_concurrency=args.max_concurrency,
            lora_modules=args.lora_modules,
            extra_body=sampling_params,
-        ))
+        )
+    )
    # Save config and results to json
    if args.save_result or args.append_result:
@@ -735,8 +813,9 @@ def main(args: argparse.Namespace):
                        "Invalid metadata format. Please use KEY=VALUE format."
                    )
        # Traffic
-        result_json["request_rate"] = (args.request_rate if args.request_rate
+        result_json["request_rate"] = (
-                                       < float("inf") else "inf")
+            args.request_rate if args.request_rate < float("inf") else "inf"
+        )
        result_json["burstiness"] = args.burstiness
        result_json["max_concurrency"] = args.max_concurrency
@@ -746,24 +825,31 @@ def main(args: argparse.Namespace):
        if not args.save_detailed:
            # Remove fields with too many data points
            for field in [
-                    "input_lens", "output_lens", "ttfts", "itls",
+                "input_lens",
-                    "generated_texts", "errors"
+                "output_lens",
+                "ttfts",
+                "itls",
+                "generated_texts",
+                "errors",
            ]:
                if field in result_json:
                    del result_json[field]
        # Save to file
        base_model_id = model_id.split("/")[-1]
-        max_concurrency_str = (f"-concurrency{args.max_concurrency}"
+        max_concurrency_str = (
-                               if args.max_concurrency is not None else "")
+            f"-concurrency{args.max_concurrency}"
-        file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json"  #noqa
+            if args.max_concurrency is not None
+            else ""
+        )
+        file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json"  # noqa
        if args.result_filename:
            file_name = args.result_filename
        if args.result_dir:
            file_name = os.path.join(args.result_dir, file_name)
-        with open(file_name,
+        with open(
-                  mode="a+" if args.append_result else "w",
+            file_name, mode="a+" if args.append_result else "w", encoding="utf-8"
-                  encoding='utf-8') as outfile:
+        ) as outfile:
            # Append a newline.
            if args.append_result and outfile.tell() != 0:
                outfile.write("\n")
@@ -773,7 +859,8 @@ def main(args: argparse.Namespace):
 if __name__ == "__main__":
    parser = FlexibleArgumentParser(
-        description="Benchmark the online serving throughput.")
+        description="Benchmark the online serving throughput."
+    )
    parser.add_argument(
        "--backend",
        type=str,
@@ -802,11 +889,13 @@ if __name__ == "__main__":
        choices=["sharegpt", "burstgpt", "sonnet", "random", "hf"],
        help="Name of the dataset to benchmark on.",
    )
-    parser.add_argument("--dataset-path",
+    parser.add_argument(
-                        type=str,
+        "--dataset-path",
-                        default=None,
+        type=str,
-                        help="Path to the sharegpt/sonnet dataset. "
+        default=None,
-                        "Or the huggingface dataset ID if using HF dataset.")
+        help="Path to the sharegpt/sonnet dataset. "
+        "Or the huggingface dataset ID if using HF dataset.",
+    )
    parser.add_argument(
        "--max-concurrency",
        type=int,
@@ -818,7 +907,8 @@ if __name__ == "__main__":
        "initiated, this argument will control how many are actually allowed "
        "to execute at a time. This means that when used in combination, the "
        "actual request rate may be lower than specified with --request-rate, "
-        "if the server is not processing requests fast enough to keep up.")
+        "if the server is not processing requests fast enough to keep up.",
+    )
    parser.add_argument(
        "--model",
@@ -829,8 +919,7 @@ if __name__ == "__main__":
    parser.add_argument(
        "--tokenizer",
        type=str,
-        help=
+        help="Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
-        "Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
    )
    parser.add_argument("--use-beam-search", action="store_true")
    parser.add_argument(
@@ -843,11 +932,13 @@ if __name__ == "__main__":
        "--logprobs",
        type=int,
        default=None,
-        help=("Number of logprobs-per-token to compute & return as part of "
+        help=(
-              "the request. If unspecified, then either (1) if beam search "
+            "Number of logprobs-per-token to compute & return as part of "
-              "is disabled, no logprobs are computed & a single dummy "
+            "the request. If unspecified, then either (1) if beam search "
-              "logprob is returned for each token; or (2) if beam search "
+            "is disabled, no logprobs are computed & a single dummy "
-              "is enabled 1 logprob per token is computed"),
+            "logprob is returned for each token; or (2) if beam search "
+            "is enabled 1 logprob per token is computed"
+        ),
    )
    parser.add_argument(
        "--request-rate",
@@ -931,35 +1022,38 @@ if __name__ == "__main__":
        "--ignore-eos",
        action="store_true",
        help="Set ignore_eos flag when sending the benchmark request."
-        "Warning: ignore_eos is not supported in deepspeed_mii and tgi.")
+        "Warning: ignore_eos is not supported in deepspeed_mii and tgi.",
+    )
    parser.add_argument(
        "--percentile-metrics",
        type=str,
        default="ttft,tpot,itl",
        help="Comma-separated list of selected metrics to report percentils. "
        "This argument specifies the metrics to report percentiles. "
-        "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". "
+        'Allowed metric names are "ttft", "tpot", "itl", "e2el". '
-        "Default value is \"ttft,tpot,itl\".")
+        'Default value is "ttft,tpot,itl".',
+    )
    parser.add_argument(
        "--metric-percentiles",
        type=str,
        default="99",
        help="Comma-separated list of percentiles for selected metrics. "
-        "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
+        'To report 25-th, 50-th, and 75-th percentiles, use "25,50,75". '
-        "Default value is \"99\". "
+        'Default value is "99". '
-        "Use \"--percentile-metrics\" to select metrics.",
+        'Use "--percentile-metrics" to select metrics.',
    )
    parser.add_argument(
        "--goodput",
        nargs="+",
        required=False,
-        help="Specify service level objectives for goodput as \"KEY:VALUE\" "
+        help='Specify service level objectives for goodput as "KEY:VALUE" '
        "pairs, where the key is a metric name, and the value is in "
-        "milliseconds. Multiple \"KEY:VALUE\" pairs can be provided, "
+        'milliseconds. Multiple "KEY:VALUE" pairs can be provided, '
        "separated by spaces. Allowed request level metric names are "
-        "\"ttft\", \"tpot\", \"e2el\". For more context on the definition of "
+        '"ttft", "tpot", "e2el". For more context on the definition of '
        "goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 "
-        "and the blog: https://hao-ai-lab.github.io/blogs/distserve")
+        "and the blog: https://hao-ai-lab.github.io/blogs/distserve",
+    )
    # group for dataset specific arguments
    sonnet_group = parser.add_argument_group("sonnet dataset options")
@@ -967,22 +1061,19 @@ if __name__ == "__main__":
        "--sonnet-input-len",
        type=int,
        default=550,
-        help=
+        help="Number of input tokens per request, used only for sonnet dataset.",
-        "Number of input tokens per request, used only for sonnet dataset.",
    )
    sonnet_group.add_argument(
        "--sonnet-output-len",
        type=int,
        default=150,
-        help=
+        help="Number of output tokens per request, used only for sonnet dataset.",
-        "Number of output tokens per request, used only for sonnet dataset.",
    )
    sonnet_group.add_argument(
        "--sonnet-prefix-len",
        type=int,
        default=200,
-        help=
+        help="Number of prefix tokens per request, used only for sonnet dataset.",
-        "Number of prefix tokens per request, used only for sonnet dataset.",
    )
    sharegpt_group = parser.add_argument_group("sharegpt dataset options")
@@ -991,22 +1082,21 @@ if __name__ == "__main__":
        type=int,
        default=None,
        help="Output length for each request. Overrides the output length "
-        "from the ShareGPT dataset.")
+        "from the ShareGPT dataset.",
+    )
    random_group = parser.add_argument_group("random dataset options")
    random_group.add_argument(
        "--random-input-len",
        type=int,
        default=1024,
-        help=
+        help="Number of input tokens per request, used only for random sampling.",
-        "Number of input tokens per request, used only for random sampling.",
    )
    random_group.add_argument(
        "--random-output-len",
        type=int,
        default=128,
-        help=
+        help="Number of output tokens per request, used only for random sampling.",
-        "Number of output tokens per request, used only for random sampling.",
    )
    random_group.add_argument(
        "--random-range-ratio",
@@ -1021,23 +1111,23 @@ if __name__ == "__main__":
        "--random-prefix-len",
        type=int,
        default=0,
-        help=("Number of fixed prefix tokens before the random context "
+        help=(
-              "in a request. "
+            "Number of fixed prefix tokens before the random context "
-              "The total input length is the sum of `random-prefix-len` and "
+            "in a request. "
-              "a random "
+            "The total input length is the sum of `random-prefix-len` and "
-              "context length sampled from [input_len * (1 - range_ratio), "
+            "a random "
-              "input_len * (1 + range_ratio)]."),
+            "context length sampled from [input_len * (1 - range_ratio), "
+            "input_len * (1 + range_ratio)]."
+        ),
    )
    hf_group = parser.add_argument_group("hf dataset options")
-    hf_group.add_argument("--hf-subset",
+    hf_group.add_argument(
-                          type=str,
+        "--hf-subset", type=str, default=None, help="Subset of the HF dataset."
-                          default=None,
+    )
-                          help="Subset of the HF dataset.")
+    hf_group.add_argument(
-    hf_group.add_argument("--hf-split",
+        "--hf-split", type=str, default=None, help="Split of the HF dataset."
-                          type=str,
+    )
-                          default=None,
-                          help="Split of the HF dataset.")
    hf_group.add_argument(
        "--hf-output-len",
        type=int,
@@ -1051,52 +1141,58 @@ if __name__ == "__main__":
        "--top-p",
        type=float,
        default=None,
-        help="Top-p sampling parameter. Only has effect on openai-compatible "
+        help="Top-p sampling parameter. Only has effect on openai-compatible backends.",
-        "backends.")
+    )
    sampling_group.add_argument(
        "--top-k",
        type=int,
        default=None,
-        help="Top-k sampling parameter. Only has effect on openai-compatible "
+        help="Top-k sampling parameter. Only has effect on openai-compatible backends.",
-        "backends.")
+    )
    sampling_group.add_argument(
        "--min-p",
        type=float,
        default=None,
-        help="Min-p sampling parameter. Only has effect on openai-compatible "
+        help="Min-p sampling parameter. Only has effect on openai-compatible backends.",
-        "backends.")
+    )
    sampling_group.add_argument(
        "--temperature",
        type=float,
        default=None,
        help="Temperature sampling parameter. Only has effect on "
        "openai-compatible backends. If not specified, default to greedy "
-        "decoding (i.e. temperature==0.0).")
+        "decoding (i.e. temperature==0.0).",
+    )
    parser.add_argument(
-        '--tokenizer-mode',
+        "--tokenizer-mode",
        type=str,
        default="auto",
-        choices=['auto', 'slow', 'mistral', 'custom'],
+        choices=["auto", "slow", "mistral", "custom"],
        help='The tokenizer mode.\n\n* "auto" will use the '
        'fast tokenizer if available.\n* "slow" will '
-        'always use the slow tokenizer. \n* '
+        "always use the slow tokenizer. \n* "
        '"mistral" will always use the `mistral_common` tokenizer. \n*'
-        '"custom" will use --tokenizer to select the preregistered tokenizer.')
+        '"custom" will use --tokenizer to select the preregistered tokenizer.',
+    )
-    parser.add_argument("--served-model-name",
-                        type=str,
+    parser.add_argument(
-                        default=None,
+        "--served-model-name",
-                        help="The model name used in the API. "
+        type=str,
-                        "If not specified, the model name will be the "
+        default=None,
-                        "same as the ``--model`` argument. ")
+        help="The model name used in the API. "
+        "If not specified, the model name will be the "
-    parser.add_argument("--lora-modules",
+        "same as the ``--model`` argument. ",
-                        nargs='+',
+    )
-                        default=None,
-                        help="A subset of LoRA module names passed in when "
+    parser.add_argument(
-                        "launching the server. For each request, the "
+        "--lora-modules",
-                        "script chooses a LoRA module at random.")
+        nargs="+",
+        default=None,
+        help="A subset of LoRA module names passed in when "
+        "launching the server. For each request, the "
+        "script chooses a LoRA module at random.",
+    )
    args = parser.parse_args()

--- a/benchmarks/benchmark_serving_structured_output.py
+++ b/benchmarks/benchmark_serving_structured_output.py
@@ -19,6 +19,7 @@ On the client side, run:
        --endpoint /generate_stream
    to the end of the command above.
 """
 import argparse
 import asyncio
 import copy
@@ -36,11 +37,15 @@ from typing import Optional
 import datasets
 import numpy as np
 import pandas as pd
-from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
-                                  RequestFuncOutput)
 from tqdm.asyncio import tqdm
 from transformers import PreTrainedTokenizerBase
+from backend_request_func import (
+    ASYNC_REQUEST_FUNCS,
+    RequestFuncInput,
+    RequestFuncOutput,
+)
 try:
    from vllm.transformers_utils.tokenizer import get_tokenizer
 except ImportError:
@@ -52,7 +57,8 @@ except ImportError:
    from argparse import ArgumentParser as FlexibleArgumentParser
 from vllm.v1.structured_output.backend_xgrammar import (
-    has_xgrammar_unsupported_json_features)
+    has_xgrammar_unsupported_json_features,
+)
 MILLISECONDS_TO_SECONDS_CONVERSION = 1000
@@ -98,6 +104,7 @@ class SampleRequest:
        prompt_len: The length of the prompt in tokens.
        expected_output_len: The expected length of the output in tokens.
    """
    prompt: str
    prompt_len: int
    expected_output_len: int
@@ -106,45 +113,45 @@ class SampleRequest:
    completion: str = None
-def sample_requests(tokenizer: PreTrainedTokenizerBase,
+def sample_requests(
-                    args: argparse.Namespace) -> list[SampleRequest]:
+    tokenizer: PreTrainedTokenizerBase, args: argparse.Namespace
-    if args.dataset == 'json' or args.dataset == 'json-unique':
+) -> list[SampleRequest]:
+    if args.dataset == "json" or args.dataset == "json-unique":
        if args.json_schema_path is None:
            dir_path = os.path.dirname(os.path.realpath(__file__))
-            args.json_schema_path = os.path.join(dir_path,
+            args.json_schema_path = os.path.join(
-                                                 "structured_schemas",
+                dir_path, "structured_schemas", "structured_schema_1.json"
-                                                 "structured_schema_1.json")
+            )
        json_schemas = []
        with open(args.json_schema_path) as f:
            schema = json.load(f)
-        if args.dataset == 'json-unique':
+        if args.dataset == "json-unique":
-            json_schemas = [
+            json_schemas = [copy.deepcopy(schema) for _ in range(args.num_prompts)]
-                copy.deepcopy(schema) for _ in range(args.num_prompts)
-            ]
            for i in range(len(json_schemas)):
-                json_schemas[i]["properties"][
+                if "properties" not in json_schemas[i]:
-                    f"__optional_field_{uuid.uuid4()}"] = {
+                    json_schemas[i]["properties"] = {}
-                        "type":
+                json_schemas[i]["properties"][f"__optional_field_{uuid.uuid4()}"] = {
-                        "string",
+                    "type": "string",
-                        "description":
+                    "description": "An unique optional field to avoid cached schemas",
-                        "An unique optional field to avoid cached schemas"
+                }
-                    }
        else:
            json_schemas = [schema] * args.num_prompts
        def gen_prompt(index: int):
-            return f"Generate an example of a user profile given the following schema: {json.dumps(get_schema(index))}"  # noqa: E501
+            return f"Generate an example of a brief user profile given the following schema: {json.dumps(get_schema(index))}"  # noqa: E501
        def get_schema(index: int):
            return json_schemas[index % len(json_schemas)]
        requests = [
-            SampleRequest(prompt=gen_prompt(i),
+            SampleRequest(
-                          prompt_len=len(tokenizer(gen_prompt(i)).input_ids),
+                prompt=gen_prompt(i),
-                          expected_output_len=args.output_len,
+                prompt_len=len(tokenizer(gen_prompt(i)).input_ids),
-                          schema=get_schema(i),
+                expected_output_len=args.output_len,
-                          structure_type=args.structure_type)
+                schema=get_schema(i),
+                structure_type=args.structure_type,
+            )
            for i in range(args.num_prompts)
        ]
@@ -168,11 +175,13 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
        input_len = len(tokenizer(prompt).input_ids)
        print(f"Input length of the prompt: {input_len} tokens")
        requests = [
-            SampleRequest(prompt=prompt,
+            SampleRequest(
-                          prompt_len=input_len,
+                prompt=prompt,
-                          expected_output_len=args.output_len,
+                prompt_len=input_len,
-                          schema=schema,
+                expected_output_len=args.output_len,
-                          structure_type=args.structure_type)
+                schema=schema,
+                structure_type=args.structure_type,
+            )
            for _ in range(args.num_prompts)
        ]
@@ -186,11 +195,13 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
        input_len = len(tokenizer(prompt).input_ids)
        print(f"Input length of the prompt: {input_len} tokens")
        requests = [
-            SampleRequest(prompt=prompt,
+            SampleRequest(
-                          prompt_len=input_len,
+                prompt=prompt,
-                          expected_output_len=args.output_len,
+                prompt_len=input_len,
-                          schema=regex,
+                expected_output_len=args.output_len,
-                          structure_type=args.structure_type)
+                schema=regex,
+                structure_type=args.structure_type,
+            )
            for _ in range(args.num_prompts)
        ]
@@ -201,47 +212,55 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
        input_len = len(tokenizer(prompt).input_ids)
        print(f"Input length of the prompt: {input_len} tokens")
        requests = [
-            SampleRequest(prompt=prompt,
+            SampleRequest(
-                          prompt_len=input_len,
+                prompt=prompt,
-                          expected_output_len=args.output_len,
+                prompt_len=input_len,
-                          schema=choice,
+                expected_output_len=args.output_len,
-                          structure_type=args.structure_type)
+                schema=choice,
+                structure_type=args.structure_type,
+            )
            for _ in range(args.num_prompts)
        ]
    elif args.dataset == "xgrammar_bench":
        requests: list[SampleRequest] = []
-        dataset = datasets.load_dataset("NousResearch/json-mode-eval",
+        dataset = datasets.load_dataset("NousResearch/json-mode-eval", split="train")
-                                        split="train")
        full_dataset_len = len(dataset)
        def _filter_func(item):
            import json
            schema = json.loads(item["schema"])
            return not has_xgrammar_unsupported_json_features(schema)
        dataset = dataset.filter(_filter_func)
        num_filtered_out = full_dataset_len - len(dataset)
-        print(f"dataset has {len(dataset)} entries after filtering "
+        print(
-              f"out {num_filtered_out} entries with unsupported features")
+            f"dataset has {len(dataset)} entries after filtering "
+            f"out {num_filtered_out} entries with unsupported features"
+        )
        len_dataset = len(dataset)
        for data_point_idx in range(args.num_prompts):
            idx = data_point_idx
            while idx >= len_dataset:
                idx -= len_dataset
            schema = dataset["schema"][idx]
-            prompt = tokenizer.apply_chat_template(dataset["prompt"][idx],
+            prompt = tokenizer.apply_chat_template(
-                                                   tokenize=False)
+                dataset["prompt"][idx], tokenize=False, add_generation_prompt=True
+            )
            input_len = len(tokenizer(prompt).input_ids)
            completion = dataset["completion"][idx]
            requests.append(
-                SampleRequest(prompt=prompt,
+                SampleRequest(
-                              prompt_len=input_len,
+                    prompt=prompt,
-                              expected_output_len=args.output_len,
+                    prompt_len=input_len,
-                              schema=schema,
+                    expected_output_len=args.output_len,
-                              structure_type=args.structure_type,
+                    schema=schema,
-                              completion=completion))
+                    structure_type=args.structure_type,
+                    completion=completion,
+                )
+            )
    return requests
@@ -273,7 +292,8 @@ async def get_request(
    # Calculate scale parameter theta to maintain the desired request_rate.
    assert burstiness > 0, (
-        f"A positive burstiness factor is expected, but given {burstiness}.")
+        f"A positive burstiness factor is expected, but given {burstiness}."
+    )
    theta = 1.0 / (request_rate * burstiness)
    for i, request in enumerate(input_requests):
@@ -315,8 +335,8 @@ def calculate_metrics(
            # multiple output tokens may be bundled together
            # Note : this may inflate the output token count slightly
            output_len = len(
-                tokenizer(outputs[i].generated_text,
+                tokenizer(outputs[i].generated_text, add_special_tokens=False).input_ids
-                          add_special_tokens=False).input_ids)
+            )
            actual_output_lens.append(output_len)
            total_input += input_requests[i].prompt_len
            tpot = 0
@@ -340,16 +360,19 @@ def calculate_metrics(
        if "ttft" in goodput_config_dict:
            valid_metrics.append(ttfts)
-            slo_values.append(goodput_config_dict["ttft"] /
+            slo_values.append(
-                              MILLISECONDS_TO_SECONDS_CONVERSION)
+                goodput_config_dict["ttft"] / MILLISECONDS_TO_SECONDS_CONVERSION
+            )
        if "tpot" in goodput_config_dict:
            valid_metrics.append(all_tpots)
-            slo_values.append(goodput_config_dict["tpot"] /
+            slo_values.append(
-                              MILLISECONDS_TO_SECONDS_CONVERSION)
+                goodput_config_dict["tpot"] / MILLISECONDS_TO_SECONDS_CONVERSION
+            )
        if "e2el" in goodput_config_dict:
            valid_metrics.append(e2els)
-            slo_values.append(goodput_config_dict["e2el"] /
+            slo_values.append(
-                              MILLISECONDS_TO_SECONDS_CONVERSION)
+                goodput_config_dict["e2el"] / MILLISECONDS_TO_SECONDS_CONVERSION
+            )
        for req_metric in zip(*valid_metrics):
            is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)])
@@ -360,7 +383,8 @@ def calculate_metrics(
        warnings.warn(
            "All requests failed. This is likely due to a misconfiguration "
            "on the benchmark arguments.",
-            stacklevel=2)
+            stacklevel=2,
+        )
    metrics = BenchmarkMetrics(
        completed=completed,
        total_input=total_input,
@@ -369,27 +393,31 @@ def calculate_metrics(
        request_goodput=good_completed / dur_s,
        output_throughput=sum(actual_output_lens) / dur_s,
        total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s,
-        mean_ttft_ms=np.mean(ttfts or 0) *
+        mean_ttft_ms=np.mean(ttfts or 0)
-        1000,  # ttfts is empty if streaming is not supported by backend
+        * 1000,  # ttfts is empty if streaming is not supported by backend
        std_ttft_ms=np.std(ttfts or 0) * 1000,
        median_ttft_ms=np.median(ttfts or 0) * 1000,
-        percentiles_ttft_ms=[(p, np.percentile(ttfts or 0, p) * 1000)
+        percentiles_ttft_ms=[
-                             for p in selected_percentiles],
+            (p, np.percentile(ttfts or 0, p) * 1000) for p in selected_percentiles
+        ],
        mean_tpot_ms=np.mean(tpots or 0) * 1000,
        std_tpot_ms=np.std(tpots or 0) * 1000,
        median_tpot_ms=np.median(tpots or 0) * 1000,
-        percentiles_tpot_ms=[(p, np.percentile(tpots or 0, p) * 1000)
+        percentiles_tpot_ms=[
-                             for p in selected_percentiles],
+            (p, np.percentile(tpots or 0, p) * 1000) for p in selected_percentiles
+        ],
        mean_itl_ms=np.mean(itls or 0) * 1000,
        std_itl_ms=np.std(itls or 0) * 1000,
        median_itl_ms=np.median(itls or 0) * 1000,
-        percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000)
+        percentiles_itl_ms=[
-                            for p in selected_percentiles],
+            (p, np.percentile(itls or 0, p) * 1000) for p in selected_percentiles
+        ],
        mean_e2el_ms=np.mean(e2els or 0) * 1000,
        std_e2el_ms=np.std(e2els or 0) * 1000,
        median_e2el_ms=np.median(e2els or 0) * 1000,
-        percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000)
+        percentiles_e2el_ms=[
-                             for p in selected_percentiles],
+            (p, np.percentile(e2els or 0, p) * 1000) for p in selected_percentiles
+        ],
    )
    return metrics, actual_output_lens
@@ -411,7 +439,6 @@ async def benchmark(
    ignore_eos: bool,
    max_concurrency: Optional[int],
    structured_output_ratio: float,
-    structured_output_backend: str,
    goodput_config_dict: Optional[dict[str, float]] = None,
 ):
    if backend in ASYNC_REQUEST_FUNCS:
@@ -423,18 +450,17 @@ async def benchmark(
        extra_body = {}
        # Add the schema to the extra_body
        extra_body[request.structure_type] = request.schema
-        # Add the specific structured_output_backend
-        extra_body["guided_decoding_backend"] = structured_output_backend
        return extra_body
    print("Starting initial single prompt test run...")
    structured_output_req_idx = random.sample(
-        range(len(input_requests)),
+        range(len(input_requests)), int(len(input_requests) * structured_output_ratio)
-        int(len(input_requests) * structured_output_ratio))
+    )
    test_request = input_requests[0]
-    test_req_extra_body = (prepare_extra_body(test_request)
+    test_req_extra_body = (
-                           if 0 in structured_output_req_idx else None)
+        prepare_extra_body(test_request) if 0 in structured_output_req_idx else None
+    )
    test_input = RequestFuncInput(
        model=model_id,
        prompt=test_request.prompt,
@@ -448,7 +474,8 @@ async def benchmark(
    if not test_output.success:
        raise ValueError(
            "Initial test run failed - Please make sure benchmark arguments "
-            f"are correctly specified. Error: {test_output.error}")
+            f"are correctly specified. Error: {test_output.error}"
+        )
    else:
        print("Initial test run completed. Starting main benchmark run...")
@@ -467,10 +494,7 @@ async def benchmark(
        if profile_output.success:
            print("Profiler started")
-    if burstiness == 1.0:
+    distribution = "Poisson process" if burstiness == 1.0 else "Gamma distribution"
-        distribution = "Poisson process"
-    else:
-        distribution = "Gamma distribution"
    print(f"Traffic request rate: {request_rate}")
    print(f"Burstiness factor: {burstiness} ({distribution})")
@@ -482,24 +506,21 @@ async def benchmark(
    # and it will simplify the code in limited_request_func.
    #    semaphore = (asyncio.Semaphore(max_concurrency)
    #                 if max_concurrency else contextlib.nullcontext())
-    semaphore = (asyncio.Semaphore(max_concurrency)
+    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
-                 if max_concurrency else None)
    async def limited_request_func(request_func_input, pbar):
        if semaphore is None:
-            return await request_func(request_func_input=request_func_input,
+            return await request_func(request_func_input=request_func_input, pbar=pbar)
-                                      pbar=pbar)
        async with semaphore:
-            return await request_func(request_func_input=request_func_input,
+            return await request_func(request_func_input=request_func_input, pbar=pbar)
-                                      pbar=pbar)
    benchmark_start_time = time.perf_counter()
    tasks: list[asyncio.Task] = []
    expected: list[str] = []
-    async for i, request in get_request(input_requests, request_rate,
+    async for i, request in get_request(input_requests, request_rate, burstiness):
-                                        burstiness):
+        extra_body = (
-        extra_body = prepare_extra_body(
+            prepare_extra_body(request) if i in structured_output_req_idx else None
-            request) if i in structured_output_req_idx else None
+        )
        request_func_input = RequestFuncInput(
            model=model_id,
            prompt=request.prompt,
@@ -512,8 +533,9 @@ async def benchmark(
        expected.append(request.completion)
        tasks.append(
            asyncio.create_task(
-                limited_request_func(request_func_input=request_func_input,
+                limited_request_func(request_func_input=request_func_input, pbar=pbar)
-                                     pbar=pbar)))
+            )
+        )
    outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)
    if profile:
@@ -545,54 +567,58 @@ async def benchmark(
        goodput_config_dict=goodput_config_dict,
    )
-    print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='='))
+    print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
    print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
-    print("{:<40} {:<10.2f}".format("Benchmark duration (s):",
+    print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
-                                    benchmark_duration))
    print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
-    print("{:<40} {:<10}".format("Total generated tokens:",
+    print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output))
-                                 metrics.total_output))
+    print(
-    print("{:<40} {:<10.2f}".format("Request throughput (req/s):",
+        "{:<40} {:<10.2f}".format(
-                                    metrics.request_throughput))
+            "Request throughput (req/s):", metrics.request_throughput
+        )
+    )
    if goodput_config_dict:
-        print("{:<40} {:<10.2f}".format("Request goodput (req/s):",
+        print(
-                                        metrics.request_goodput))
+            "{:<40} {:<10.2f}".format(
-    print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):",
+                "Request goodput (req/s):", metrics.request_goodput
-                                    metrics.output_throughput))
+            )
-    print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):",
+        )
-                                    metrics.total_token_throughput))
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Output token throughput (tok/s):", metrics.output_throughput
+        )
+    )
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Total Token throughput (tok/s):", metrics.total_token_throughput
+        )
+    )
    result = {
-        "duration":
+        "duration": benchmark_duration,
-        benchmark_duration,
+        "completed": metrics.completed,
-        "completed":
+        "total_input_tokens": metrics.total_input,
-        metrics.completed,
+        "total_output_tokens": metrics.total_output,
-        "total_input_tokens":
+        "request_throughput": metrics.request_throughput,
-        metrics.total_input,
+        "output_throughput": metrics.output_throughput,
-        "total_output_tokens":
+        "total_token_throughput": metrics.total_token_throughput,
-        metrics.total_output,
+        "ttft_description": pd.Series([output.ttft for output in outputs])
-        "request_throughput":
+        .describe()
-        metrics.request_throughput,
+        .to_dict(),
-        "output_throughput":
+        "tpot_description": pd.Series([output.tpot for output in outputs])
-        metrics.output_throughput,
+        .describe()
-        "total_token_throughput":
+        .to_dict(),
-        metrics.total_token_throughput,
-        "ttft_description":
-        pd.Series([output.ttft for output in outputs]).describe().to_dict(),
-        "tpot_description":
-        pd.Series([output.tpot for output in outputs]).describe().to_dict(),
        "input_lens": [output.prompt_len for output in outputs],
-        "output_lens":
+        "output_lens": actual_output_lens,
-        actual_output_lens,
        "ttfts": [output.ttft for output in outputs],
        "itls": [output.itl for output in outputs],
        "errors": [output.error for output in outputs],
    }
-    ret = [{
+    ret = [
-        'generated': output.generated_text,
+        {"generated": output.generated_text, "expected": gt}
-        'expected': gt
+        for output, gt in zip(outputs, expected)
-    } for output, gt in zip(outputs, expected)]
+    ]
    def process_one_metric(
        # E.g., "ttft"
@@ -606,29 +632,35 @@ async def benchmark(
        # metric.
        if metric_attribute_name not in selected_percentile_metrics:
            return
-        print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-'))
+        print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-"))
-        print("{:<40} {:<10.2f}".format(
+        print(
-            f"Mean {metric_name} (ms):",
+            "{:<40} {:<10.2f}".format(
-            getattr(metrics, f"mean_{metric_attribute_name}_ms")))
+                f"Mean {metric_name} (ms):",
-        print("{:<40} {:<10.2f}".format(
+                getattr(metrics, f"mean_{metric_attribute_name}_ms"),
-            f"Median {metric_name} (ms):",
+            )
-            getattr(metrics, f"median_{metric_attribute_name}_ms")))
+        )
+        print(
+            "{:<40} {:<10.2f}".format(
+                f"Median {metric_name} (ms):",
+                getattr(metrics, f"median_{metric_attribute_name}_ms"),
+            )
+        )
        result[f"mean_{metric_attribute_name}_ms"] = getattr(
-            metrics, f"mean_{metric_attribute_name}_ms")
+            metrics, f"mean_{metric_attribute_name}_ms"
+        )
        result[f"median_{metric_attribute_name}_ms"] = getattr(
-            metrics, f"median_{metric_attribute_name}_ms")
+            metrics, f"median_{metric_attribute_name}_ms"
+        )
        result[f"std_{metric_attribute_name}_ms"] = getattr(
-            metrics, f"std_{metric_attribute_name}_ms")
+            metrics, f"std_{metric_attribute_name}_ms"
-        for p, value in getattr(metrics,
+        )
-                                f"percentiles_{metric_attribute_name}_ms"):
+        for p, value in getattr(metrics, f"percentiles_{metric_attribute_name}_ms"):
            p_word = str(int(p)) if int(p) == p else str(p)
-            print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):",
+            print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", value))
-                                            value))
            result[f"p{p_word}_{metric_attribute_name}_ms"] = value
    process_one_metric("ttft", "TTFT", "Time to First Token")
-    process_one_metric("tpot", "TPOT",
+    process_one_metric("tpot", "TPOT", "Time per Output Token (excl. 1st token)")
-                       "Time per Output Token (excl. 1st token)")
    process_one_metric("itl", "ITL", "Inter-token Latency")
    process_one_metric("e2el", "E2EL", "End-to-end Latency")
@@ -638,13 +670,13 @@ async def benchmark(
 def evaluate(ret, args):
    def _eval_correctness_json(expected, actual):
        # extract json string from string using regex
        import re
-        actual = actual.replace('\n', '').replace(' ', '').strip()
+        actual = actual.replace("\n", "").replace(" ", "").strip()
        try:
-            actual = re.search(r'\{.*\}', actual).group()
+            actual = re.search(r"\{.*\}", actual).group()
            actual = json.loads(actual)
        except Exception:
            return False
@@ -656,28 +688,32 @@ def evaluate(ret, args):
    def _eval_correctness_regex(expected, actual):
        import re
        return re.match(args.regex, actual) is not None
    def _eval_correctness(expected, actual):
-        if args.structure_type == 'guided_json':
+        if args.structure_type == "guided_json":
            return _eval_correctness_json(expected, actual)
-        elif args.structure_type == 'guided_regex':
+        elif args.structure_type == "guided_regex":
            return _eval_correctness_regex(expected, actual)
-        elif args.structure_type == 'guided_choice':
+        elif args.structure_type == "guided_choice":
            return _eval_correctness_choice(expected, actual)
        else:
            return None
    scores = []
    for res in ret:
-        score = _eval_correctness(res['expected'], res['generated'])
+        score = _eval_correctness(res["expected"], res["generated"])
-        res['correctness'] = score
+        res["correctness"] = score
        scores.append(score)
    not_none_scores = [score for score in scores if score is not None]
-    return (sum(not_none_scores) / len(not_none_scores) *
+    return (
-            100) if len(not_none_scores) > 0 else None
+        (sum(not_none_scores) / len(not_none_scores) * 100)
+        if len(not_none_scores) > 0
+        else None
+    )
 def parse_goodput(slo_pairs):
@@ -689,9 +725,10 @@ def parse_goodput(slo_pairs):
    except ValueError as err:
        raise argparse.ArgumentTypeError(
            "Invalid format found for service level objectives. "
-            "Specify service level objectives for goodput as \"KEY:VALUE\" "
+            'Specify service level objectives for goodput as "KEY:VALUE" '
            "pairs, where the key is a metric name, and the value is a "
-            "number in milliseconds.") from err
+            "number in milliseconds."
+        ) from err
    return goodput_config_dict
@@ -705,12 +742,14 @@ def check_goodput_args(args):
                raise ValueError(
                    f"Invalid metric name found, {slo_name}: {slo_val}. "
                    "The service level objective name should be one of "
-                    f"{str(VALID_NAMES)}. ")
+                    f"{str(VALID_NAMES)}. "
+                )
            if slo_val < 0:
                raise ValueError(
                    f"Invalid value found, {slo_name}: {slo_val}. "
                    "The service level objective value should be "
-                    "non-negative.")
+                    "non-negative."
+                )
    return goodput_config_dict
@@ -736,19 +775,19 @@ def main(args: argparse.Namespace):
        tokenizer_mode=args.tokenizer_mode,
    )
-    if args.dataset == 'grammar':
+    if args.dataset == "grammar":
-        args.structure_type = 'guided_grammar'
+        args.structure_type = "guided_grammar"
-    elif args.dataset == 'regex':
+    elif args.dataset == "regex":
-        args.structure_type = 'guided_regex'
+        args.structure_type = "guided_regex"
-    elif args.dataset == 'choice':
+    elif args.dataset == "choice":
-        args.structure_type = 'guided_choice'
+        args.structure_type = "guided_choice"
    else:
-        args.structure_type = 'guided_json'
+        args.structure_type = "guided_json"
    if args.no_structured_output:
        args.structured_output_ratio = 0
    if args.save_results:
-        result_file_name = f'{args.structured_output_ratio}guided'
+        result_file_name = f"{args.structured_output_ratio}guided"
        result_file_name += f"_{backend}"
        result_file_name += f"_{args.request_rate}qps"
        result_file_name += f"_{args.model.split('/')[-1]}"
@@ -776,37 +815,29 @@ def main(args: argparse.Namespace):
            disable_tqdm=args.disable_tqdm,
            profile=args.profile,
            selected_percentile_metrics=args.percentile_metrics.split(","),
-            selected_percentiles=[
+            selected_percentiles=[float(p) for p in args.metric_percentiles.split(",")],
-                float(p) for p in args.metric_percentiles.split(",")
-            ],
            ignore_eos=args.ignore_eos,
            max_concurrency=args.max_concurrency,
            structured_output_ratio=args.structured_output_ratio,
-            structured_output_backend=args.structured_output_backend,
            goodput_config_dict=goodput_config_dict,
-        ))
+        )
+    )
    # Save config and results to json
    score = evaluate(ret, args)
-    print("correct_rate(%)", score, '\n')
+    print("correct_rate(%)", score, "\n")
    if args.save_results:
        results = {
-            "backend":
+            "backend": backend,
-            backend,
+            "model_id": model_id,
-            "model_id":
+            "tokenizer_id": tokenizer_id,
-            model_id,
+            "num_prompts": args.num_prompts,
-            "tokenizer_id":
+            "request_rate": args.request_rate
-            tokenizer_id,
+            if args.request_rate < float("inf")
-            "num_prompts":
+            else "inf",
-            args.num_prompts,
+            "burstiness": args.burstiness,
-            "request_rate":
+            "max_concurrency": args.max_concurrency,
-            args.request_rate if args.request_rate < float("inf") else "inf",
+            "correct_rate(%)": score,
-            "burstiness":
-            args.burstiness,
-            "max_concurrency":
-            args.max_concurrency,
-            "correct_rate(%)":
-            score
        }
        results = {"outputs": ret, **results, **benchmark_result}
@@ -815,13 +846,14 @@ def main(args: argparse.Namespace):
            result_file_name = args.result_filename
        if args.result_dir:
            result_file_name = os.path.join(args.result_dir, result_file_name)
-        with open(result_file_name, "w", encoding='utf-8') as outfile:
+        with open(result_file_name, "w", encoding="utf-8") as outfile:
            json.dump(results, outfile, indent=4)
 if __name__ == "__main__":
    parser = FlexibleArgumentParser(
-        description="Benchmark the online serving throughput.")
+        description="Benchmark the online serving throughput."
+    )
    parser.add_argument(
        "--backend",
        type=str,
@@ -843,16 +875,14 @@ if __name__ == "__main__":
        default="/v1/completions",
        help="API endpoint.",
    )
-    parser.add_argument("--dataset",
+    parser.add_argument(
-                        default='json',
+        "--dataset",
-                        choices=[
+        default="json",
-                            'json', 'json-unique', 'grammar', 'regex',
+        choices=["json", "json-unique", "grammar", "regex", "choice", "xgrammar_bench"],
-                            'choice', 'xgrammar_bench'
+    )
-                        ])
+    parser.add_argument(
-    parser.add_argument("--json_schema_path",
+        "--json-schema-path", type=str, default=None, help="Path to json schema."
-                        type=str,
+    )
-                        default=None,
-                        help="Path to json schema.")
    parser.add_argument(
        "--max-concurrency",
        type=int,
@@ -864,7 +894,8 @@ if __name__ == "__main__":
        "initiated, this argument will control how many are actually allowed "
        "to execute at a time. This means that when used in combination, the "
        "actual request rate may be lower than specified with --request-rate, "
-        "if the server is not processing requests fast enough to keep up.")
+        "if the server is not processing requests fast enough to keep up.",
+    )
    parser.add_argument(
        "--model",
        type=str,
@@ -874,15 +905,13 @@ if __name__ == "__main__":
    parser.add_argument(
        "--tokenizer",
        type=str,
-        help=
+        help="Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
-        "Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
    )
    parser.add_argument(
        "--tokenizer-mode",
        type=str,
        default="auto",
-        help=
+        help="Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
-        "Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
    )
    parser.add_argument(
        "--num-prompts",
@@ -959,52 +988,51 @@ if __name__ == "__main__":
        "--ignore-eos",
        action="store_true",
        help="Set ignore_eos flag when sending the benchmark request."
-        "Warning: ignore_eos is not supported in deepspeed_mii and tgi.")
+        "Warning: ignore_eos is not supported in deepspeed_mii and tgi.",
+    )
    parser.add_argument(
        "--percentile-metrics",
        type=str,
        default="ttft,tpot,itl",
        help="Comma-separated list of selected metrics to report percentils. "
        "This argument specifies the metrics to report percentiles. "
-        "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". "
+        'Allowed metric names are "ttft", "tpot", "itl", "e2el". '
-        "Default value is \"ttft,tpot,itl\".")
+        'Default value is "ttft,tpot,itl".',
+    )
    parser.add_argument(
        "--metric-percentiles",
        type=str,
        default="99",
        help="Comma-separated list of percentiles for selected metrics. "
-        "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
+        'To report 25-th, 50-th, and 75-th percentiles, use "25,50,75". '
-        "Default value is \"99\". "
+        'Default value is "99". '
-        "Use \"--percentile-metrics\" to select metrics.",
+        'Use "--percentile-metrics" to select metrics.',
    )
    parser.add_argument(
        "--goodput",
        nargs="+",
        required=False,
-        help="Specify service level objectives for goodput as \"KEY:VALUE\" "
+        help='Specify service level objectives for goodput as "KEY:VALUE" '
        "pairs, where the key is a metric name, and the value is in "
-        "milliseconds. Multiple \"KEY:VALUE\" pairs can be provided, "
+        'milliseconds. Multiple "KEY:VALUE" pairs can be provided, '
        "separated by spaces. Allowed request level metric names are "
-        "\"ttft\", \"tpot\", \"e2el\". For more context on the definition of "
+        '"ttft", "tpot", "e2el". For more context on the definition of '
        "goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 "
-        "and the blog: https://hao-ai-lab.github.io/blogs/distserve")
+        "and the blog: https://hao-ai-lab.github.io/blogs/distserve",
+    )
-    parser.add_argument("--no-structured-output",
-                        action='store_true',
+    parser.add_argument(
-                        default=False,
+        "--no-structured-output",
-                        help="Whether to disable JSON decoding or not.")
+        action="store_true",
-    parser.add_argument("--structured-output-ratio",
+        default=False,
-                        type=float,
+        help="Whether to disable JSON decoding or not.",
-                        default=1.0,
+    )
-                        help="Ratio of Structured Outputs requests")
+    parser.add_argument(
-    parser.add_argument("--structured-output-backend",
+        "--structured-output-ratio",
-                        type=str,
+        type=float,
-                        choices=[
+        default=1.0,
-                            "outlines", "lm-format-enforcer", "xgrammar",
+        help="Ratio of Structured Outputs requests",
-                            "guidance", "auto"
+    )
-                        ],
-                        default="auto",
-                        help="Backend to use for structured outputs")
    args = parser.parse_args()
    main(args)