Initial commit

1712ebff · jerrrrry · 1712ebff · 1712ebff · 1712ebff · 1712ebff
Commit 1712ebff authored Feb 06, 2026 by jerrrrry
18 changed files
--- a/1d.sh
+++ b/1d.sh
+export ALLREDUCE_STREAM_WITH_COMPUTE=1 #同流
+export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')  #ip地址
+export VLLM_TORCH_PROFILER_DIR=/workspace #torchprof
+export DEBUG_CLR_GRAPH_PACKET_CAPTURE=false
+export VLLM_FUSED_MOE_CHUNK_SIZE=8192
+export VLLM_USE_GLOBAL_CACHE13=1 #不影响性能，帮助解决oom
+export VLLM_ENABLE_TBO=0 #defalut
+export VLLM_ZERO_OVERHEAD=1
+current_time=$(date +"%Y%m%d-%H%M")
+
+#export VLLM_P2P_ASYNC=1
+
+
+
+vllm serve /module3/DeepSeek-R1-0528-W4A8-V2 \
+--host 0.0.0.0   \
+--port 20009 \
+--trust-remote-code \
+--dtype bfloat16 \
+-q slimquant_w4a8_marlin \
+--kv-cache-dtype fp8_e5m2 \
+--max-model-len 49152 \
+-tp 8 \
+--gpu-memory-utilization 0.95 \
+--max-num-seqs 256 \
+--block-size 64 \
+--disable-log-requests  \
+--max-num-batched-tokens 8192 \
+--no-enable-prefix-caching \
+--enable-chunked-prefill \
+--speculative_config '{"method": "deepseek_mtp", "num_speculative_tokens": 3}' \
+--kv-transfer-config '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"1e9","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"10.16.1.115","proxy_port":"30007","http_port":"20009","send_type":"PUT_ASYNC","mem_pool_size_gb":256}}' \
+2>&1 | tee 1d_log/2d-${current_time}.log
+
--- a/1p.sh
+++ b/1p.sh
+export ALLREDUCE_STREAM_WITH_COMPUTE=1 #同流
+export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')  #ip地址
+#export VLLM_TORCH_PROFILER_DIR=/workspace #torchprof
+export VLLM_USE_LIGHTOP_MOE_SUM_MUL_ADD=1
+export USE_FUSED_RMS_QUANT=1 #启用rmsnorm+quant融合，其它优化默认已打开
+export DEBUG_CLR_GRAPH_PACKET_CAPTURE=false
+export VLLM_SPEC_DECODE_EAGER=1
+
+export VLLM_USE_GLOBAL_CACHE13=1
+export VLLM_FUSED_MOE_CHUNK_SIZE=8192
+export SENDRECV_STREAM_WITH_COMPUTE=1
+export VLLM_ENABLE_TBO=0
+export VLLM_REJECT_SAMPLE_OPT=1
+export VLLM_USE_LIGHTOP_RMS_ROPE_CONCAT=1
+export VLLM_ZERO_OVERHEAD=1
+export VLLM_NUMA_BIND=1
+export VLLM_RANK0_NUMA=0
+export VLLM_RANK1_NUMA=1
+export VLLM_RANK2_NUMA=2
+export VLLM_RANK3_NUMA=3
+export VLLM_RANK4_NUMA=4
+export VLLM_RANK5_NUMA=5
+export VLLM_RANK6_NUMA=6
+export VLLM_RANK7_NUMA=7
+current_time=$(date +"%Y%m%d-%H%M")
+
+#-pp 2 -tp 4 \
+#--enable-expert-parallel
+vllm serve /module3/DeepSeek-R1-0528-W4A8-V2 \
+--port 20011 \
+--trust-remote-code  \
+--dtype bfloat16 \
+-q slimquant_w4a8_marlin \
+--kv-cache-dtype fp8_e5m2 \
+--max-model-len 49152 \
+--max-num-batched-tokens 8192 \
+-tp 8 \
+--gpu-memory-utilization 0.93 \
+--max-num-seqs 512 \
+--disable-log-requests \
+--block-size 64 \
+--enforce-eager \
+--no-enable-prefix-caching \
+--enable-chunked-prefill \
+--speculative_config '{"method": "deepseek_mtp", "num_speculative_tokens": 3}' \
+#--kv-transfer-config '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"21001","kv_connector_extra_config":{"proxy_ip":"10.16.1.115","proxy_port":"30007","http_port":"20011","send_type":"PUT_ASYNC"}}' \
+2>&1 | tee 1p_log/1p-${current_time}.log
--- a/__pycache__/backend_request_func.cpython-310.pyc
+++ b/__pycache__/backend_request_func.cpython-310.pyc
--- a/__pycache__/benchmark_dataset.cpython-310.pyc
+++ b/__pycache__/benchmark_dataset.cpython-310.pyc
--- a/__pycache__/benchmark_utils.cpython-310.pyc
+++ b/__pycache__/benchmark_utils.cpython-310.pyc
--- a/backend_request_func.py
+++ b/backend_request_func.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import io
+import json
+import os
+import sys
+import time
+import traceback
+from dataclasses import dataclass, field
+from typing import Optional, Union
+
+import aiohttp
+import huggingface_hub.constants
+from tqdm.asyncio import tqdm
+from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
+
+# NOTE(simon): do not import vLLM here so the benchmark script
+# can run without vLLM installed.
+
+# Shared client timeout for every request function in this module:
+# a total budget of 6 hours (6 * 60 * 60 seconds) per request.
+AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
+
+
+@dataclass
+class RequestFuncInput:
+    """Parameters describing one benchmark request to a serving backend."""
+
+    # Prompt text placed in the request payload.
+    prompt: str
+    # Endpoint URL the request is POSTed to; each request function asserts
+    # the URL suffix it expects.
+    api_url: str
+    # Prompt length; copied to RequestFuncOutput.prompt_len and used as the
+    # TGI "truncate" parameter.
+    prompt_len: int
+    # Requested number of output tokens (sent as the max-token limit).
+    output_len: int
+    # Model identifier sent to the server when model_name is not set.
+    model: str
+    # Optional override; preferred over `model` by the OpenAI-style functions.
+    model_name: Optional[str] = None
+    # Forwarded as "logprobs" in the OpenAI completions payload.
+    logprobs: Optional[int] = None
+    # Extra key/value pairs merged into the payload via dict.update().
+    extra_body: Optional[dict] = None
+    # Appended to the chat message content; the audio function reads its
+    # "audio" entry.
+    multi_modal_content: Optional[dict] = None
+    # When true, the backend is asked to keep generating past EOS.
+    ignore_eos: bool = False
+    # NOTE(review): not read by any request function visible in this file.
+    language: Optional[str] = None
+
+
+@dataclass
+class RequestFuncOutput:
+    """Result and timing measurements collected for one benchmark request."""
+
+    # Text returned by the backend (accumulated across stream chunks).
+    generated_text: str = ""
+    # True when the request completed with HTTP 200 and was parsed.
+    success: bool = False
+    # Seconds from request start to the last received chunk/response.
+    latency: float = 0.0
+    # Completion token count; filled from the "usage" field when available.
+    output_tokens: int = 0
+    ttft: float = 0.0  # Time to first token
+    itl: list[float] = field(default_factory=list)  # list of inter-token latencies
+    tpot: float = 0.0  # avg next-token latencies
+    # Copied from RequestFuncInput.prompt_len.
+    prompt_len: int = 0
+    # HTTP reason phrase or formatted traceback when the request failed.
+    error: str = ""
+
+
+async def async_request_tgi(
+    request_func_input: RequestFuncInput,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    """Send one streaming request to a TGI ``generate_stream`` endpoint.
+
+    Args:
+        request_func_input: Prompt, endpoint URL and length settings.
+        pbar: Optional progress bar, advanced by one when the request ends.
+
+    Returns:
+        A RequestFuncOutput with TTFT, inter-token latencies, total latency
+        and the generated text. On failure ``success`` is False and ``error``
+        holds the HTTP reason or a formatted traceback.
+    """
+    api_url = request_func_input.api_url
+    assert api_url.endswith("generate_stream")
+
+    async with aiohttp.ClientSession(
+        trust_env=True, timeout=AIOHTTP_TIMEOUT
+    ) as session:
+        params = {
+            "max_new_tokens": request_func_input.output_len,
+            "do_sample": True,
+            "temperature": 0.01,  # TGI does not accept 0.0 temperature.
+            "top_p": 0.99,  # TGI does not accept 1.0 top_p.
+            "truncate": request_func_input.prompt_len,
+            "ignore_eos_token": request_func_input.ignore_eos,
+        }
+        payload = {
+            "inputs": request_func_input.prompt,
+            "parameters": params,
+        }
+        output = RequestFuncOutput()
+        output.prompt_len = request_func_input.prompt_len
+        if request_func_input.ignore_eos:
+            output.output_tokens = request_func_input.output_len
+        else:
+            # Left as None despite the int annotation; presumably the caller
+            # derives the count from the generated text - verify against caller.
+            output.output_tokens = None
+
+        ttft = 0.0
+        st = time.perf_counter()
+        most_recent_timestamp = st
+        try:
+            async with session.post(url=api_url, json=payload) as response:
+                if response.status == 200:
+                    async for chunk_bytes in response.content:
+                        chunk_bytes = chunk_bytes.strip()
+                        if not chunk_bytes:
+                            continue
+                        chunk_bytes = chunk_bytes.decode("utf-8")
+
+                        # NOTE: Sometimes TGI returns a ping response without
+                        # any data, we should skip it.
+                        if chunk_bytes.startswith(":"):
+                            continue
+                        chunk = chunk_bytes.removeprefix("data:")
+
+                        data = json.loads(chunk)
+                        timestamp = time.perf_counter()
+                        # First token
+                        if ttft == 0.0:
+                            ttft = time.perf_counter() - st
+                            output.ttft = ttft
+
+                        # Decoding phase
+                        else:
+                            output.itl.append(timestamp - most_recent_timestamp)
+
+                        most_recent_timestamp = timestamp
+
+                    output.latency = most_recent_timestamp - st
+                    output.success = True
+                    # `data` is the last parsed chunk. If the stream produced
+                    # no data chunk, this lookup raises and the request is
+                    # reported as failed by the except branch below.
+                    output.generated_text = data["generated_text"]
+                else:
+                    output.error = response.reason or ""
+                    output.success = False
+        except Exception:
+            # Any failure is recorded on the output instead of propagating.
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = "".join(traceback.format_exception(*exc_info))
+
+        if pbar:
+            pbar.update(1)
+        return output
+
+
+async def async_request_trt_llm(
+    request_func_input: RequestFuncInput,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    """Send one streaming request to a TensorRT-LLM ``generate_stream`` URL.
+
+    Args:
+        request_func_input: Prompt, endpoint URL and length settings.
+        pbar: Optional progress bar, advanced by one when the request ends.
+
+    Returns:
+        A RequestFuncOutput with TTFT, inter-token latencies, total latency
+        and the concatenated ``text_output`` of every chunk. On failure
+        ``success`` is False and ``error`` holds the HTTP reason or a
+        formatted traceback.
+    """
+    api_url = request_func_input.api_url
+    assert api_url.endswith("generate_stream")
+
+    async with aiohttp.ClientSession(
+        trust_env=True, timeout=AIOHTTP_TIMEOUT
+    ) as session:
+        payload = {
+            "accumulate_tokens": True,
+            "text_input": request_func_input.prompt,
+            "temperature": 0.0,
+            "top_p": 1.0,
+            "max_tokens": request_func_input.output_len,
+            "stream": True,
+        }
+        if request_func_input.ignore_eos:
+            # Force the full requested length by setting the minimum length
+            # equal to the maximum.
+            payload["min_length"] = request_func_input.output_len
+        output = RequestFuncOutput()
+        output.prompt_len = request_func_input.prompt_len
+
+        ttft = 0.0
+        st = time.perf_counter()
+        most_recent_timestamp = st
+        try:
+            async with session.post(url=api_url, json=payload) as response:
+                if response.status == 200:
+                    async for chunk_bytes in response.content:
+                        chunk_bytes = chunk_bytes.strip()
+                        if not chunk_bytes:
+                            continue
+
+                        chunk = chunk_bytes.decode("utf-8").removeprefix("data:")
+
+                        data = json.loads(chunk)
+                        output.generated_text += data["text_output"]
+                        timestamp = time.perf_counter()
+                        # First token
+                        if ttft == 0.0:
+                            ttft = timestamp - st
+                            output.ttft = ttft
+
+                        # Decoding phase
+                        else:
+                            output.itl.append(timestamp - most_recent_timestamp)
+
+                        most_recent_timestamp = timestamp
+
+                    # Latency ends at the last received chunk, not at stream
+                    # close.
+                    output.latency = most_recent_timestamp - st
+                    output.success = True
+
+                else:
+                    output.error = response.reason or ""
+                    output.success = False
+        except Exception:
+            # Any failure is recorded on the output instead of propagating.
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = "".join(traceback.format_exception(*exc_info))
+
+        if pbar:
+            pbar.update(1)
+        return output
+
+
+async def async_request_deepspeed_mii(
+    request_func_input: RequestFuncInput,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    api_url = request_func_input.api_url
+    assert api_url.endswith(("completions", "profile")), (
+        "OpenAI Completions API URL must end with 'completions' or 'profile'."
+    )
+
+    async with aiohttp.ClientSession(
+        trust_env=True, timeout=AIOHTTP_TIMEOUT
+    ) as session:
+        payload = {
+            "model": request_func_input.model,
+            "prompt": request_func_input.prompt,
+            "max_tokens": request_func_input.output_len,
+            "temperature": 0.01,  # deepspeed-mii does not accept 0.0 temp.
+            "top_p": 1.0,
+        }
+        headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
+
+        output = RequestFuncOutput()
+        output.prompt_len = request_func_input.prompt_len
+
+        # NOTE: DeepSpeed-MII doesn't support streaming as of Jan 28 2024,
+        # will use 0 as placeholder.
+        # See https://github.com/microsoft/DeepSpeed-MII/pull/311
+        output.ttft = 0
+
+        st = time.perf_counter()
+        try:
+            async with session.post(
+                url=api_url, json=payload, headers=headers
+            ) as response:
+                if response.status == 200:
+                    parsed_resp = await response.json()
+                    output.latency = time.perf_counter() - st
+                    if "choices" in parsed_resp:
+                        output.generated_text = parsed_resp["choices"][0]["text"]
+                    elif "text" in parsed_resp:
+                        output.generated_text = parsed_resp["text"][0]
+                    else:
+                        output.error = (
+                            "Unexpected response format: "
+                            "neither 'choices' nor 'text' found"
+                        )
+                        output.success = False
+                    output.success = True
+                else:
+                    output.error = response.reason or ""
+                    output.success = False
+        except Exception:
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = "".join(traceback.format_exception(*exc_info))
+
+        if pbar:
+            pbar.update(1)
+        return output
+
+
+async def async_request_openai_completions(
+    request_func_input: RequestFuncInput,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    """Send one streaming request to an OpenAI-style completions endpoint.
+
+    Args:
+        request_func_input: Prompt, endpoint URL, model and length settings.
+        pbar: Optional progress bar, advanced by one when the request ends.
+
+    Returns:
+        A RequestFuncOutput with TTFT, inter-token latencies, total latency,
+        generated text and (when the server reports usage) the completion
+        token count. The request is marked failed if no chunk containing
+        ``choices`` was ever received.
+    """
+    api_url = request_func_input.api_url
+    assert api_url.endswith(("completions", "profile")), (
+        "OpenAI Completions API URL must end with 'completions' or 'profile'."
+    )
+
+    async with aiohttp.ClientSession(
+        trust_env=True, timeout=AIOHTTP_TIMEOUT
+    ) as session:
+        payload = {
+            "model": request_func_input.model_name
+            if request_func_input.model_name
+            else request_func_input.model,
+            "prompt": request_func_input.prompt,
+            "temperature": 0.0,
+            "repetition_penalty": 1.0,
+            "max_tokens": request_func_input.output_len,
+            "logprobs": request_func_input.logprobs,
+            "stream": True,
+            "stream_options": {
+                "include_usage": True,
+            },
+        }
+        if request_func_input.ignore_eos:
+            payload["ignore_eos"] = request_func_input.ignore_eos
+        if request_func_input.extra_body:
+            # Caller-supplied keys override the defaults set above.
+            payload.update(request_func_input.extra_body)
+        headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
+
+        output = RequestFuncOutput()
+        output.prompt_len = request_func_input.prompt_len
+
+        generated_text = ""
+        st = time.perf_counter()
+        most_recent_timestamp = st
+        try:
+            async with session.post(
+                url=api_url, json=payload, headers=headers
+            ) as response:
+                if response.status == 200:
+                    first_chunk_received = False
+                    async for chunk_bytes in response.content:
+                        chunk_bytes = chunk_bytes.strip()
+                        if not chunk_bytes:
+                            continue
+
+                        chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
+                        if chunk != "[DONE]":
+                            data = json.loads(chunk)
+
+                            # NOTE: Some completion API might have a last
+                            # usage summary response without a token so we
+                            # want to check a token was generated
+                            if choices := data.get("choices"):
+                                # Note that text could be empty here
+                                # e.g. for special tokens
+                                text = choices[0].get("text")
+                                timestamp = time.perf_counter()
+                                # First token
+                                if not first_chunk_received:
+                                    first_chunk_received = True
+                                    ttft = time.perf_counter() - st
+                                    output.ttft = ttft
+
+                                # Decoding phase
+                                else:
+                                    output.itl.append(timestamp - most_recent_timestamp)
+
+                                most_recent_timestamp = timestamp
+                                generated_text += text or ""
+                            # Checked independently of `choices`, so a chunk
+                            # carrying both still updates the token count.
+                            if usage := data.get("usage"):
+                                output.output_tokens = usage.get("completion_tokens")
+                    if first_chunk_received:
+                        output.success = True
+                    else:
+                        output.success = False
+                        output.error = (
+                            "Never received a valid chunk to calculate TTFT."
+                            "This response will be marked as failed!"
+                        )
+                    output.generated_text = generated_text
+                    # Latency ends at the last chunk that carried `choices`.
+                    output.latency = most_recent_timestamp - st
+                else:
+                    output.error = response.reason or ""
+                    output.success = False
+        except Exception:
+            # Any failure is recorded on the output instead of propagating.
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = "".join(traceback.format_exception(*exc_info))
+
+    if pbar:
+        pbar.update(1)
+    return output
+
+
+async def async_request_openai_chat_completions(
+    request_func_input: RequestFuncInput,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    """Send one streaming request to an OpenAI-style chat completions endpoint.
+
+    Args:
+        request_func_input: Prompt, endpoint URL, model, length settings and
+            optional multi-modal content appended to the user message.
+        pbar: Optional progress bar, advanced by one when the request ends.
+
+    Returns:
+        A RequestFuncOutput with TTFT, inter-token latencies, total latency,
+        generated text and (when the server reports usage) the completion
+        token count. Any HTTP 200 response is marked successful, even if no
+        content chunk was received.
+    """
+    api_url = request_func_input.api_url
+    assert api_url.endswith(("chat/completions", "profile")), (
+        "OpenAI Chat Completions API URL must end with 'chat/completions'."
+    )
+
+    async with aiohttp.ClientSession(
+        trust_env=True, timeout=AIOHTTP_TIMEOUT
+    ) as session:
+        content = [{"type": "text", "text": request_func_input.prompt}]
+        if request_func_input.multi_modal_content:
+            content.append(request_func_input.multi_modal_content)
+        payload = {
+            "model": request_func_input.model_name
+            if request_func_input.model_name
+            else request_func_input.model,
+            "messages": [
+                {"role": "user", "content": content},
+            ],
+            "temperature": 0.0,
+            "max_completion_tokens": request_func_input.output_len,
+            "stream": True,
+            "stream_options": {
+                "include_usage": True,
+            },
+        }
+        if request_func_input.ignore_eos:
+            payload["ignore_eos"] = request_func_input.ignore_eos
+        if request_func_input.extra_body:
+            # Caller-supplied keys override the defaults set above.
+            payload.update(request_func_input.extra_body)
+        headers = {
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
+        }
+
+        output = RequestFuncOutput()
+        output.prompt_len = request_func_input.prompt_len
+
+        generated_text = ""
+        ttft = 0.0
+        st = time.perf_counter()
+        most_recent_timestamp = st
+        try:
+            async with session.post(
+                url=api_url, json=payload, headers=headers
+            ) as response:
+                if response.status == 200:
+                    async for chunk_bytes in response.content:
+                        chunk_bytes = chunk_bytes.strip()
+                        if not chunk_bytes:
+                            continue
+                        chunk_bytes = chunk_bytes.decode("utf-8")
+                        # NOTE: SSE comments (often used as pings) start with a colon.
+                        # These are not JSON data payload and should be skipped.
+                        if chunk_bytes.startswith(":"):
+                            continue
+
+                        chunk = chunk_bytes.removeprefix("data: ")
+
+                        if chunk != "[DONE]":
+                            timestamp = time.perf_counter()
+                            data = json.loads(chunk)
+
+                            if choices := data.get("choices"):
+                                # Rebinds `content` (the message list built
+                                # above) to the streamed delta text; the
+                                # payload has already been constructed.
+                                content = choices[0]["delta"].get("content")
+                                # First token
+                                if ttft == 0.0:
+                                    ttft = timestamp - st
+                                    output.ttft = ttft
+
+                                # Decoding phase
+                                else:
+                                    output.itl.append(timestamp - most_recent_timestamp)
+
+                                generated_text += content or ""
+                            # Usage is only read from chunks without choices.
+                            elif usage := data.get("usage"):
+                                output.output_tokens = usage.get("completion_tokens")
+
+                            # Updated for every non-[DONE] chunk, including
+                            # usage-only chunks.
+                            most_recent_timestamp = timestamp
+
+                    output.generated_text = generated_text
+                    output.success = True
+                    output.latency = most_recent_timestamp - st
+                else:
+                    output.error = response.reason or ""
+                    output.success = False
+        except Exception:
+            # Any failure is recorded on the output instead of propagating.
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = "".join(traceback.format_exception(*exc_info))
+
+    if pbar:
+        pbar.update(1)
+    return output
+
+
+async def async_request_openai_audio(
+    request_func_input: RequestFuncInput,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    # Lazy import without PlaceholderModule to avoid vllm dep.
+    import soundfile
+
+    api_url = request_func_input.api_url
+    assert api_url.endswith(("transcriptions", "translations")), (
+        "OpenAI Chat Completions API URL must end with 'transcriptions' "
+    )
+    "or `translations`."
+
+    async with aiohttp.ClientSession(
+        trust_env=True, timeout=AIOHTTP_TIMEOUT
+    ) as session:
+        content = [{"type": "text", "text": request_func_input.prompt}]
+        payload = {
+            "model": request_func_input.model_name
+            if request_func_input.model_name
+            else request_func_input.model,
+            "temperature": 0.0,
+            "max_completion_tokens": request_func_input.output_len,
+            "stream": True,
+            "language": "en",
+            # Flattened due to multipart/form-data
+            "stream_include_usage": True,
+            "stream_continuous_usage_stats": True,
+        }
+        if request_func_input.extra_body:
+            payload.update(request_func_input.extra_body)
+        headers = {
+            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
+        }
+
+        # Send audio file
+        def to_bytes(y, sr):
+            buffer = io.BytesIO()
+            soundfile.write(buffer, y, sr, format="WAV")
+            buffer.seek(0)
+            return buffer
+
+        with to_bytes(*request_func_input.multi_modal_content["audio"]) as f:
+            form = aiohttp.FormData()
+            form.add_field("file", f, content_type="audio/wav")
+            for key, value in payload.items():
+                form.add_field(key, str(value))
+
+            output = RequestFuncOutput()
+            output.prompt_len = request_func_input.prompt_len
+
+            generated_text = ""
+            ttft = 0.0
+            st = time.perf_counter()
+            most_recent_timestamp = st
+            try:
+                async with session.post(
+                    url=api_url, data=form, headers=headers
+                ) as response:
+                    if response.status == 200:
+                        async for chunk_bytes in response.content:
+                            chunk_bytes = chunk_bytes.strip()
+                            if not chunk_bytes:
+                                continue
+
+                            chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
+                            if chunk != "[DONE]":
+                                timestamp = time.perf_counter()
+                                data = json.loads(chunk)
+
+                                if choices := data.get("choices"):
+                                    content = choices[0]["delta"].get("content")
+                                    # First token
+                                    if ttft == 0.0:
+                                        ttft = timestamp - st
+                                        output.ttft = ttft
+
+                                    # Decoding phase
+                                    else:
+                                        output.itl.append(
+                                            timestamp - most_recent_timestamp
+                                        )
+
+                                    generated_text += content or ""
+                                elif usage := data.get("usage"):
+                                    output.output_tokens = usage.get(
+                                        "completion_tokens"
+                                    )
+
+                                most_recent_timestamp = timestamp
+
+                        output.generated_text = generated_text
+                        output.success = True
+                        output.latency = most_recent_timestamp - st
+                    else:
+                        output.error = response.reason or ""
+                        output.success = False
+            except Exception:
+                output.success = False
+                exc_info = sys.exc_info()
+                output.error = "".join(traceback.format_exception(*exc_info))
+
+        if pbar:
+            pbar.update(1)
+        return output
+
+
+def get_model(pretrained_model_name_or_path: str) -> str:
+    if os.getenv("VLLM_USE_MODELSCOPE", "False").lower() == "true":
+        from modelscope import snapshot_download
+
+        from vllm.model_executor.model_loader.weight_utils import get_lock
+
+        # Use file lock to prevent multiple processes from
+        # downloading the same model weights at the same time.
+        with get_lock(pretrained_model_name_or_path):
+            model_path = snapshot_download(
+                model_id=pretrained_model_name_or_path,
+                local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
+                ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"],
+            )
+
+            return model_path
+    return pretrained_model_name_or_path
+
+
+def get_tokenizer(
+    pretrained_model_name_or_path: str,
+    tokenizer_mode: str = "auto",
+    trust_remote_code: bool = False,
+    **kwargs,
+) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
+    if pretrained_model_name_or_path is not None and not os.path.exists(
+        pretrained_model_name_or_path
+    ):
+        pretrained_model_name_or_path = get_model(pretrained_model_name_or_path)
+    if tokenizer_mode == "slow":
+        if kwargs.get("use_fast", False):
+            raise ValueError("Cannot use the fast tokenizer in slow tokenizer mode.")
+        kwargs["use_fast"] = False
+    if tokenizer_mode == "mistral":
+        try:
+            from vllm.transformers_utils.tokenizer import MistralTokenizer
+        except ImportError as e:
+            raise ImportError(
+                "MistralTokenizer requires vllm package.\n"
+                "Please install it with `pip install vllm` "
+                "to use mistral tokenizer mode."
+            ) from e
+        return MistralTokenizer.from_pretrained(str(pretrained_model_name_or_path))
+    else:
+        return AutoTokenizer.from_pretrained(
+            pretrained_model_name_or_path,
+            trust_remote_code=trust_remote_code,
+            **kwargs,
+        )
+
+
+# Maps a backend name to the coroutine function that issues one request to
+# that backend. Several backends share the OpenAI completions implementation.
+ASYNC_REQUEST_FUNCS = {
+    "tgi": async_request_tgi,
+    "vllm": async_request_openai_completions,
+    "lmdeploy": async_request_openai_completions,
+    "deepspeed-mii": async_request_deepspeed_mii,
+    "openai": async_request_openai_completions,
+    "openai-chat": async_request_openai_chat_completions,
+    "openai-audio": async_request_openai_audio,
+    "tensorrt-llm": async_request_trt_llm,
+    "scalellm": async_request_openai_completions,
+    "sglang": async_request_openai_completions,
+    "llama.cpp": async_request_openai_completions,
+}
+
+# Backend names whose request function is the OpenAI completions or chat
+# completions implementation (so "openai-audio" and "deepspeed-mii", which
+# use their own functions, are not included).
+OPENAI_COMPATIBLE_BACKENDS = [
+    k
+    for k, v in ASYNC_REQUEST_FUNCS.items()
+    if v in (async_request_openai_completions, async_request_openai_chat_completions)
+]
--- a/benchmark_dataset.py
+++ b/benchmark_dataset.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+This module defines a framework for sampling benchmark requests from various
+datasets. Each dataset subclass of BenchmarkDataset must implement sample
+generation. Supported dataset types include:
+  - ShareGPT
+  - Random (synthetic)
+  - Sonnet
+  - BurstGPT
+  - HuggingFace
+  - VisionArena
+"""
+
+import base64
+import io
+import json
+import logging
+import random
+from abc import ABC, abstractmethod
+from collections.abc import Mapping
+from dataclasses import dataclass
+from functools import cache
+from io import BytesIO
+from typing import Any, Callable, Optional, Union
+
+import numpy as np
+import pandas as pd
+from datasets import load_dataset
+from PIL import Image
+from transformers import PreTrainedTokenizerBase
+
+from vllm.lora.request import LoRARequest
+from vllm.lora.utils import get_adapter_absolute_path
+from vllm.multimodal import MultiModalDataDict
+from vllm.multimodal.image import convert_image_mode
+from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer
+
+logger = logging.getLogger(__name__)
+
+# -----------------------------------------------------------------------------
+# Data Classes
+# -----------------------------------------------------------------------------
+
+
+@dataclass
+class SampleRequest:
+    """
+    Represents a single inference request for benchmarking.
+    """
+
+    # Prompt payload; typed as a string or any other object.
+    prompt: Union[str, Any]
+    # Length of the prompt - presumably in tokens; verify against the
+    # dataset implementations that build these requests.
+    prompt_len: int
+    # Output length the request is expected to produce - presumably in
+    # tokens; TODO confirm.
+    expected_output_len: int
+    # Optional multi-modal payload accompanying the prompt.
+    multi_modal_data: Optional[Union[MultiModalDataDict, dict]] = None
+    # Optional LoRA adapter request associated with this sample.
+    lora_request: Optional[LoRARequest] = None
+
+
+# -----------------------------------------------------------------------------
+# Benchmark Dataset Base Class
+# -----------------------------------------------------------------------------
+
+
+class BenchmarkDataset(ABC):
+    """
+    Abstract base class for the benchmark datasets in this module.
+
+    Concrete datasets implement ``sample`` (and usually ``load_data``) and
+    can reuse the LoRA selection and oversampling helpers defined here.
+    """
+
+    # Seed used when the caller passes ``random_seed=None``.
+    DEFAULT_SEED = 0
+    # Overridden with True by datasets that attach multimodal content.
+    IS_MULTIMODAL = False
+
+    def __init__(
+        self,
+        dataset_path: Optional[str] = None,
+        random_seed: int = DEFAULT_SEED,
+    ) -> None:
+        """
+        Initialize the BenchmarkDataset with an optional dataset path and random
+        seed.
+
+        Args:
+            dataset_path (Optional[str]): Path to the dataset. If None, it
+                indicates that a default or random dataset might be used.
+            random_seed (int): Seed value for reproducible shuffling or
+                sampling. Defaults to DEFAULT_SEED.
+        """
+        self.dataset_path = dataset_path
+        # Set the random seed, ensuring that a None value is replaced with the
+        # default seed.
+        self.random_seed = random_seed if random_seed is not None else self.DEFAULT_SEED
+        self.data = None
+
+    def apply_multimodal_chat_transformation(
+        self, prompt: str, mm_content: Optional[MultiModalDataDict] = None
+    ) -> list[dict]:
+        """
+        Transform a prompt and optional multimodal content into a chat format.
+        This method is used for chat models that expect a specific conversation
+        format.
+
+        Returns:
+            list[dict]: A single-element list holding one "user" message whose
+            content is the text part followed by ``mm_content`` when given.
+        """
+        content = [{"text": prompt, "type": "text"}]
+        if mm_content is not None:
+            content.append(mm_content)
+        return [{"role": "user", "content": content}]
+
+    def load_data(self) -> None:
+        """
+        Load data from the dataset path into self.data.
+
+        This method must be overridden by subclasses since the method to load
+        data will vary depending on the dataset format and source.
+
+        Raises:
+            NotImplementedError: If a subclass does not implement this method.
+        """
+        # TODO (jenniferzhao): add support for downloading data
+        raise NotImplementedError("load_data must be implemented in subclasses.")
+
+    def get_random_lora_request(
+        self,
+        tokenizer: PreTrainedTokenizerBase,
+        max_loras: Optional[int] = None,
+        lora_path: Optional[str] = None,
+    ) -> tuple[Optional[LoRARequest], AnyTokenizer]:
+        """
+        Optionally select a random LoRA request and return its associated
+        tokenizer.
+
+        This method is used when LoRA parameters are provided.  It randomly
+        selects a LoRA based on max_loras and retrieves a cached tokenizer for
+        that LoRA if available. Otherwise, it returns the base tokenizer.
+
+        Args:
+            tokenizer (PreTrainedTokenizerBase): The base tokenizer to use if
+                no LoRA is selected.
+            max_loras (Optional[int]): The maximum number of LoRAs available.
+                If None, LoRA is not used.
+            lora_path (Optional[str]): Path to the LoRA parameters on disk. If
+                None, LoRA is not used.
+
+        Returns:
+            tuple[Optional[LoRARequest], AnyTokenizer]: A tuple where the first
+            element is a LoRARequest (or None if not applicable) and the second
+            element is the tokenizer associated with the LoRA request (or the
+            base tokenizer).
+        """
+        if max_loras is None or lora_path is None:
+            return None, tokenizer
+
+        # Generate a random LoRA ID in the range [1, max_loras].
+        lora_id = random.randint(1, max_loras)
+        lora_request = LoRARequest(
+            lora_name=str(lora_id),
+            lora_int_id=lora_id,
+            lora_path=lora_path_on_disk(lora_path),
+        )
+        if lora_id not in lora_tokenizer_cache:
+            lora_tokenizer_cache[lora_id] = get_lora_tokenizer(lora_request)
+        # Return lora_request and the cached tokenizer if available; otherwise,
+        # return the base tokenizer
+        return lora_request, lora_tokenizer_cache[lora_id] or tokenizer
+
+    @abstractmethod
+    def sample(
+        self, tokenizer: PreTrainedTokenizerBase, num_requests: int
+    ) -> list[SampleRequest]:
+        """
+        Abstract method to generate sample requests from the dataset.
+
+        Subclasses must override this method to implement dataset-specific logic
+        for generating a list of SampleRequest objects.
+
+        Args:
+            tokenizer (PreTrainedTokenizerBase): The tokenizer to be used
+                for processing the dataset's text.
+            num_requests (int): The number of sample requests to generate.
+
+        Returns:
+            list[SampleRequest]: A list of sample requests generated from the
+            dataset.
+        """
+        raise NotImplementedError("sample must be implemented in subclasses.")
+
+    def maybe_oversample_requests(
+        self, requests: list[SampleRequest], num_requests: int
+    ) -> None:
+        """
+        Oversamples the list of requests if its size is less than the desired
+        number. The list is extended in place with randomly chosen duplicates
+        of its own elements.
+
+        Note that this reseeds the global ``random`` module with
+        ``self.random_seed`` so the chosen duplicates are reproducible.
+
+        Args:
+            requests (List[SampleRequest]): The current list of sampled
+                requests.
+            num_requests (int): The target number of requests.
+
+        Raises:
+            ValueError: If ``requests`` is empty while ``num_requests`` is
+                positive, because there is nothing to duplicate.
+        """
+        shortfall = num_requests - len(requests)
+        if shortfall <= 0:
+            return
+        if not requests:
+            # random.choices() would fail with an opaque IndexError on an
+            # empty population; report the real problem instead.
+            raise ValueError(
+                f"Cannot oversample to {num_requests} requests: the dataset "
+                "produced no usable requests."
+            )
+        random.seed(self.random_seed)
+        requests.extend(random.choices(requests, k=shortfall))
+        logger.info("Oversampled requests to reach %d total samples.", num_requests)
+
+
+# -----------------------------------------------------------------------------
+# Utility Functions and Global Caches
+# -----------------------------------------------------------------------------
+
+
+def is_valid_sequence(
+    prompt_len: int,
+    output_len: int,
+    min_len: int = 4,
+    max_prompt_len: int = 1024,
+    max_total_len: int = 2048,
+    skip_min_output_len_check: bool = False,
+) -> bool:
+    """
+    Decide whether a (prompt, output) length pair should be kept.
+
+    A pair is valid when the prompt has at least ``min_len`` and at most
+    ``max_prompt_len`` tokens, the output has at least ``min_len`` tokens
+    (unless ``skip_min_output_len_check`` is set), and prompt plus output do
+    not exceed ``max_total_len``.
+
+    Default pruning criteria are copied from the original `sample_hf_requests`
+    and `sample_sharegpt_requests` functions in benchmark_serving.py, as well as
+    from `sample_requests` in benchmark_throughput.py.
+    """
+    # Each entry is True when the corresponding requirement is satisfied.
+    requirements = (
+        not (prompt_len < min_len),
+        skip_min_output_len_check or not (output_len < min_len),
+        not (prompt_len > max_prompt_len),
+        not ((prompt_len + output_len) > max_total_len),
+    )
+    return all(requirements)
+
+
+@cache
+def lora_path_on_disk(lora_path: str) -> str:
+    """
+    Resolve ``lora_path`` through ``get_adapter_absolute_path``.
+
+    The ``@cache`` decorator presumably memoizes the result per path string
+    (assuming it is ``functools.cache`` -- confirm against the file's
+    imports), so a given adapter path would be resolved only once.
+    """
+    return get_adapter_absolute_path(lora_path)
+
+
+# Global cache for LoRA tokenizers, keyed by the integer LoRA id. Entries are
+# added lazily by BenchmarkDataset.get_random_lora_request the first time an
+# id is drawn; nothing in the code shown here evicts them.
+lora_tokenizer_cache: dict[int, AnyTokenizer] = {}
+
+
+def process_image(image: Any) -> Mapping[str, Any]:
+    """
+    Process a single image input and return a multimedia content dictionary.
+
+    Supports three input types:
+
+    1. Dictionary with raw image bytes:
+       - Expects a dict with a 'bytes' key containing raw image data.
+       - Loads the bytes as a PIL.Image.Image and continues as in (2).
+
+    2. PIL.Image.Image input:
+       - Converts the image to RGB.
+       - Saves the image as a JPEG in memory.
+       - Encodes the JPEG data as a base64 string.
+       - Returns a dictionary with the image as a base64 data URL.
+
+    3. String input:
+       - Treats the string as a URL or local file path.
+       - Prepends "file://" unless the string already starts with "http://",
+         "https://" or "file://".
+       - Returns a dictionary with the image URL.
+
+    Raises:
+        ValueError: If the input is not a supported type.
+    """
+    # Strings are handled first; a str can never match the dict or PIL
+    # branches below, so the ordering does not change any result.
+    if isinstance(image, str):
+        # "https://" must be recognised too, otherwise secure URLs would be
+        # rewritten into bogus "file://https://..." paths.
+        has_scheme = image.startswith(("http://", "https://", "file://"))
+        image_url = image if has_scheme else f"file://{image}"
+        return {"type": "image_url", "image_url": {"url": image_url}}
+
+    if isinstance(image, dict) and "bytes" in image:
+        image = Image.open(BytesIO(image["bytes"]))
+    if isinstance(image, Image.Image):
+        image = convert_image_mode(image, "RGB")
+        with io.BytesIO() as image_data:
+            image.save(image_data, format="JPEG")
+            image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8")
+        return {
+            "type": "image_url",
+            "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"},
+        }
+
+    raise ValueError(
+        f"Invalid image input {image}. Must be a PIL.Image.Image"
+        " or str or dictionary with raw image bytes."
+    )
+
+
+# -----------------------------------------------------------------------------
+# Random Dataset Implementation (Synthetic Data)
+# -----------------------------------------------------------------------------
+
+
+class RandomDataset(BenchmarkDataset):
+    """
+    Synthetic dataset: prompts are decoded from generated token-id sequences
+    whose lengths are drawn around the requested input/output lengths.
+    """
+
+    # Default values copied from benchmark_serving.py for the random dataset.
+    DEFAULT_PREFIX_LEN = 0
+    DEFAULT_RANGE_RATIO = 0.0
+    DEFAULT_INPUT_LEN = 1024
+    DEFAULT_OUTPUT_LEN = 128
+
+    def __init__(
+        self,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+
+    def sample(
+        self,
+        tokenizer: PreTrainedTokenizerBase,
+        num_requests: int,
+        prefix_len: int = DEFAULT_PREFIX_LEN,
+        range_ratio: float = DEFAULT_RANGE_RATIO,
+        input_len: int = DEFAULT_INPUT_LEN,
+        output_len: int = DEFAULT_OUTPUT_LEN,
+        **kwargs,
+    ) -> list[SampleRequest]:
+        """
+        Build ``num_requests`` synthetic requests.
+
+        Input and output lengths are drawn uniformly from
+        ``[X * (1 - range_ratio), X * (1 + range_ratio)]``. Every prompt
+        starts with the same ``prefix_len`` random token ids, followed by a
+        run of consecutive token ids (modulo the vocabulary size).
+        """
+        # Enforce range_ratio < 1
+        assert range_ratio < 1.0, (
+            "random_range_ratio must be < 1.0 to ensure a valid sampling range"
+        )
+
+        vocab_size = tokenizer.vocab_size
+        # Subtract the number of special tokens the tokenizer reports adding.
+        real_input_len = input_len - tokenizer.num_special_tokens_to_add()
+
+        # The prefix is drawn once and shared by every request.
+        if prefix_len > 0:
+            shared_prefix = np.random.randint(0, vocab_size, size=prefix_len).tolist()
+        else:
+            shared_prefix = []
+
+        # New sampling logic: [X * (1 - b), X * (1 + b)]
+        input_low = int(real_input_len * (1 - range_ratio))
+        input_high = int(real_input_len * (1 + range_ratio))
+        output_low = int(output_len * (1 - range_ratio))
+        output_high = int(output_len * (1 + range_ratio))
+
+        # Add logging for debugging
+        logger.info("Sampling input_len from [%s, %s]", input_low, input_high)
+        logger.info("Sampling output_len from [%s, %s]", output_low, output_high)
+
+        # randint's upper bound is exclusive, hence the "+ 1".
+        input_lens = np.random.randint(input_low, input_high + 1, size=num_requests)
+        output_lens = np.random.randint(output_low, output_high + 1, size=num_requests)
+        offsets = np.random.randint(0, vocab_size, size=num_requests)
+
+        requests = []
+        per_request = zip(input_lens, output_lens, offsets)
+        for i, (n_in, n_out, offset) in enumerate(per_request):
+            body_ids = ((offset + i + np.arange(n_in)) % vocab_size).tolist()
+            prompt = tokenizer.decode(shared_prefix + body_ids)
+            # After decoding the prompt we have to encode and decode it again.
+            # This is done because in some cases N consecutive tokens
+            # give a string tokenized into != N number of tokens.
+            # For example for GPT2Tokenizer:
+            # [6880, 6881] -> ['Ġcalls', 'here'] ->
+            # [1650, 939, 486] -> ['Ġcall', 'sh', 'ere']
+            # To avoid uncontrolled change of the prompt length,
+            # the encoded sequence is truncated before being decoded again.
+            length_cap = prefix_len + int(n_in)
+            round_trip_ids = tokenizer.encode(prompt, add_special_tokens=False)
+            round_trip_ids = round_trip_ids[:length_cap]
+            prompt = tokenizer.decode(round_trip_ids)
+            requests.append(
+                SampleRequest(
+                    prompt=prompt,
+                    prompt_len=len(round_trip_ids),
+                    expected_output_len=int(n_out),
+                )
+            )
+        return requests
+
+
+# -----------------------------------------------------------------------------
+# ShareGPT Dataset Implementation
+# -----------------------------------------------------------------------------
+
+
+class ShareGPTDataset(BenchmarkDataset):
+    """
+    Implements the ShareGPT dataset.  Loads data from a JSON file and generates
+    sample requests based on conversation turns.
+    """
+
+    def __init__(self, **kwargs) -> None:
+        super().__init__(**kwargs)
+        self.load_data()
+
+    def load_data(self) -> None:
+        """
+        Read the JSON file at ``self.dataset_path``, keep entries that have at
+        least two conversation turns, and shuffle them with ``random_seed``.
+
+        Raises:
+            ValueError: If ``dataset_path`` was not provided.
+        """
+        if self.dataset_path is None:
+            raise ValueError("dataset_path must be provided for loading data.")
+
+        with open(self.dataset_path, encoding="utf-8") as f:
+            self.data = json.load(f)
+        # Filter entries with at least two conversation turns.
+        self.data = [
+            entry
+            for entry in self.data
+            if "conversations" in entry and len(entry["conversations"]) >= 2
+        ]
+        random.seed(self.random_seed)
+        random.shuffle(self.data)
+
+    def sample(
+        self,
+        tokenizer: PreTrainedTokenizerBase,
+        num_requests: int,
+        lora_path: Optional[str] = None,
+        max_loras: Optional[int] = None,
+        output_len: Optional[int] = None,
+        enable_multimodal_chat: bool = False,
+        **kwargs,
+    ) -> list:
+        """
+        Build up to ``num_requests`` requests from the first two turns of each
+        conversation, oversampling if too few entries pass validation.
+
+        Args:
+            tokenizer: Base tokenizer; used directly when no LoRA is selected.
+            num_requests: Number of requests to return.
+            lora_path: Path to LoRA weights; LoRA is used only when both this
+                and ``max_loras`` are given.
+            max_loras: Upper bound of the random LoRA id range.
+            output_len: Fixed expected output length. When None the length of
+                the tokenized second turn is used.
+            enable_multimodal_chat: If True, the prompt is wrapped into the
+                chat-message structure.
+        """
+        samples: list = []
+        for entry in self.data:
+            if len(samples) >= num_requests:
+                break
+            prompt, completion = (
+                entry["conversations"][0]["value"],
+                entry["conversations"][1]["value"],
+            )
+
+            # Bind the per-request tokenizer to its own name. Rebinding the
+            # ``tokenizer`` parameter would hand a previously selected LoRA
+            # tokenizer to later iterations as the "base" fallback.
+            lora_request, request_tokenizer = self.get_random_lora_request(
+                tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path
+            )
+            prompt_ids = request_tokenizer(prompt).input_ids
+            completion_ids = request_tokenizer(completion).input_ids
+            prompt_len = len(prompt_ids)
+            new_output_len = len(completion_ids) if output_len is None else output_len
+            if not is_valid_sequence(
+                prompt_len,
+                new_output_len,
+                # A caller-supplied output_len is not subject to the minimum.
+                skip_min_output_len_check=output_len is not None,
+            ):
+                continue
+            if enable_multimodal_chat:
+                prompt = self.apply_multimodal_chat_transformation(prompt, None)
+            samples.append(
+                SampleRequest(
+                    prompt=prompt,
+                    prompt_len=prompt_len,
+                    expected_output_len=new_output_len,
+                    lora_request=lora_request,
+                )
+            )
+        self.maybe_oversample_requests(samples, num_requests)
+        return samples
+
+
+# -----------------------------------------------------------------------------
+# Custom Dataset Implementation
+# -----------------------------------------------------------------------------
+
+
+class CustomDataset(BenchmarkDataset):
+    """
+    Implements the Custom dataset.  Loads data from a JSONL file and generates
+    one sample request per ``prompt`` entry. E.g.,
+    ```
+    {"prompt": "What is the capital of India?"}
+    {"prompt": "What is the capital of Iran?"}
+    {"prompt": "What is the capital of China?"}
+    ```
+    """
+
+    def __init__(self, **kwargs) -> None:
+        super().__init__(**kwargs)
+        self.load_data()
+
+    def load_data(self) -> None:
+        """
+        Populate ``self.data`` with one dict per JSONL row and shuffle it.
+
+        Raises:
+            ValueError: If no path was given or the file lacks a 'prompt'
+                column.
+            NotImplementedError: If the path does not end in ".jsonl".
+        """
+        if self.dataset_path is None:
+            raise ValueError("dataset_path must be provided for loading data.")
+
+        # Standardized format consumed by sample(): a list of dictionaries,
+        # e.g. [{"prompt": "What is the capital of India?"}, ...].
+        # load_data() is responsible for converting whatever file type
+        # dataset_path points at into this shape.
+        self.data = []
+
+        if not self.dataset_path.endswith(".jsonl"):
+            raise NotImplementedError(
+                "Only JSONL format is supported for CustomDataset."
+            )
+
+        # Load the JSONL file
+        frame = pd.read_json(path_or_buf=self.dataset_path, lines=True)
+
+        # check if the JSONL file has a 'prompt' column
+        if "prompt" not in frame.columns:
+            raise ValueError("JSONL file must contain a 'prompt' column.")
+
+        # One dictionary per DataFrame row.
+        self.data.extend(row.to_dict() for _, row in frame.iterrows())
+
+        random.seed(self.random_seed)
+        random.shuffle(self.data)
+
+    def sample(
+        self,
+        tokenizer: PreTrainedTokenizerBase,
+        num_requests: int,
+        lora_path: Optional[str] = None,
+        max_loras: Optional[int] = None,
+        output_len: Optional[int] = None,
+        enable_multimodal_chat: bool = False,
+        skip_chat_template: bool = False,
+        **kwargs,
+    ) -> list:
+        """
+        Turn the loaded prompts into at most ``num_requests`` requests,
+        oversampling when the file holds fewer prompts than requested.
+
+        Unless ``skip_chat_template`` is set, each prompt is rendered through
+        the tokenizer's chat template as a single user message.
+        ``output_len`` is passed through as the expected output length.
+        """
+        collected = []
+        for record in self.data:
+            if len(collected) >= num_requests:
+                break
+            text = record["prompt"]
+
+            # apply template
+            if not skip_chat_template:
+                text = tokenizer.apply_chat_template(
+                    [{"role": "user", "content": text}],
+                    add_generation_prompt=True,
+                    tokenize=False,
+                )
+
+            collected.append(
+                SampleRequest(
+                    prompt=text,
+                    prompt_len=len(tokenizer(text).input_ids),
+                    expected_output_len=output_len,
+                )
+            )
+        self.maybe_oversample_requests(collected, num_requests)
+
+        return collected
+
+
+# -----------------------------------------------------------------------------
+# Sonnet Dataset Implementation
+# -----------------------------------------------------------------------------
+
+
+class SonnetDataset(BenchmarkDataset):
+    """
+    Simplified implementation of the Sonnet dataset.  Loads poem lines from a
+    text file and generates sample requests.  Default values here copied from
+    `benchmark_serving.py` for the sonnet dataset.
+    """
+
+    DEFAULT_PREFIX_LEN = 200
+    DEFAULT_INPUT_LEN = 550
+    DEFAULT_OUTPUT_LEN = 150
+
+    def __init__(
+        self,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+        self.load_data()
+
+    def load_data(self) -> None:
+        """Read every line of the text file at ``dataset_path`` into ``self.data``."""
+        if not self.dataset_path:
+            raise ValueError("dataset_path must be provided.")
+        with open(self.dataset_path, encoding="utf-8") as f:
+            self.data = f.readlines()
+
+    def sample(
+        self,
+        tokenizer,
+        num_requests: int,
+        prefix_len: int = DEFAULT_PREFIX_LEN,
+        input_len: int = DEFAULT_INPUT_LEN,
+        output_len: int = DEFAULT_OUTPUT_LEN,
+        return_prompt_formatted: bool = False,
+        **kwargs,
+    ) -> list:
+        """
+        Build ``num_requests`` prompts out of poem lines.
+
+        Each prompt is a fixed instruction, a fixed block of leading poem
+        lines (the shared prefix) and randomly chosen extra lines. Candidates
+        whose templated token count exceeds ``input_len`` are discarded and
+        redrawn.
+
+        Raises:
+            ValueError: If ``input_len`` does not exceed the token length of
+                the templated base prompt.
+        """
+        # Calculate average token length for a poem line.
+        line_token_counts = [len(tokenizer(line).input_ids) for line in self.data]
+        avg_len = sum(line_token_counts) / len(line_token_counts)
+
+        # Build the base prompt.
+        base_prompt = "Pick as many lines as you can from these poem lines:\n"
+        base_fmt = tokenizer.apply_chat_template(
+            [{"role": "user", "content": base_prompt}],
+            add_generation_prompt=True,
+            tokenize=False,
+        )
+        base_offset = len(tokenizer(base_fmt).input_ids)
+        if input_len <= base_offset:
+            raise ValueError(
+                f"'input_len' must be higher than the base prompt length "
+                f"({base_offset})."
+            )
+
+        # Determine how many poem lines to use.
+        num_input_lines = round((input_len - base_offset) / avg_len)
+        num_prefix_lines = max(round((prefix_len - base_offset) / avg_len), 0)
+        num_extra_lines = num_input_lines - num_prefix_lines
+        prefix_lines = self.data[:num_prefix_lines]
+
+        samples = []
+        while len(samples) < num_requests:
+            extra_lines = random.choices(self.data, k=num_extra_lines)
+            prompt = base_prompt + "".join(prefix_lines + extra_lines)
+            prompt_formatted = tokenizer.apply_chat_template(
+                [{"role": "user", "content": prompt}],
+                add_generation_prompt=True,
+                tokenize=False,
+            )
+            prompt_len = len(tokenizer(prompt_formatted).input_ids)
+            if prompt_len > input_len:
+                # Too long once templated; draw another set of lines.
+                continue
+            samples.append(
+                SampleRequest(
+                    prompt=prompt_formatted if return_prompt_formatted else prompt,
+                    prompt_len=prompt_len,
+                    expected_output_len=output_len,
+                )
+            )
+        return samples
+
+
+# -----------------------------------------------------------------------------
+# BurstGPT Dataset Implementation
+# -----------------------------------------------------------------------------
+
+
+class BurstGPTDataset(BenchmarkDataset):
+    """
+    Implements the BurstGPT dataset.  Loads data from a CSV file and generates
+    sample requests based on synthetic prompt generation. Only rows with Model
+    "GPT-4" and positive response tokens are used.
+    """
+
+    def __init__(self, **kwargs) -> None:
+        super().__init__(**kwargs)
+        self.load_data()
+
+    def load_data(
+        self,
+    ) -> None:
+        """
+        Read the CSV at ``dataset_path`` and keep the usable rows in
+        ``self.data`` as a DataFrame.
+
+        Raises:
+            ValueError: If ``dataset_path`` was not provided.
+        """
+        if self.dataset_path is None:
+            raise ValueError("dataset_path must be provided for loading data.")
+
+        df = pd.read_csv(self.dataset_path)
+        # Filter to keep only GPT-4 rows.
+        gpt4_df = df[df["Model"] == "GPT-4"]
+        # Remove failed requests (where Response tokens is 0 or less).
+        gpt4_df = gpt4_df[gpt4_df["Response tokens"] > 0]
+        # Row sampling happens later, in _sample_loaded_data().
+        self.data = gpt4_df
+
+    def _sample_loaded_data(self, num_requests: int) -> list:
+        """
+        Draw ``num_requests`` rows, seeded with ``random_seed``, and return
+        them as a list of row-value lists.
+        """
+        data = self.data.sample(
+            n=num_requests,
+            random_state=self.random_seed,
+            # Sample with replacement only when more rows are requested than
+            # are available.
+            replace=num_requests > len(self.data),
+        )
+        # Convert the dataframe to a list of lists.
+        return data.values.tolist()
+
+    def sample(
+        self,
+        tokenizer: PreTrainedTokenizerBase,
+        num_requests: int,
+        max_loras: Optional[int] = None,
+        lora_path: Optional[str] = None,
+        **kwargs,
+    ) -> list[SampleRequest]:
+        """
+        Build ``num_requests`` requests with synthetic prompts whose lengths
+        come from the sampled trace rows.
+
+        Args:
+            tokenizer: Base tokenizer; used directly when no LoRA is selected.
+            num_requests: Number of requests to generate.
+            max_loras: Upper bound of the random LoRA id range.
+            lora_path: Path to LoRA weights; LoRA is used only when both this
+                and ``max_loras`` are given.
+        """
+        samples = []
+        data = self._sample_loaded_data(num_requests=num_requests)
+        for i, row in enumerate(data):
+            # Positional columns 2 and 3 -- presumably "Request tokens" and
+            # "Response tokens" in the BurstGPT CSV layout; confirm.
+            input_len = int(row[2])
+            output_len = int(row[3])
+            # Bind the per-request tokenizer to its own name. Rebinding the
+            # ``tokenizer`` parameter would hand a previously selected LoRA
+            # tokenizer to later iterations as the "base" fallback.
+            lora_req, request_tokenizer = self.get_random_lora_request(
+                tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path
+            )
+            vocab_size = request_tokenizer.vocab_size
+            # Generate a synthetic prompt: a list of token IDs computed as (i +
+            # j) modulo vocab_size.
+            token_ids = [(i + j) % vocab_size for j in range(input_len)]
+            prompt = request_tokenizer.decode(token_ids)
+            samples.append(
+                SampleRequest(
+                    prompt=prompt,
+                    prompt_len=input_len,
+                    expected_output_len=output_len,
+                    lora_request=lora_req,
+                )
+            )
+        return samples
+
+
+# -----------------------------------------------------------------------------
+# HuggingFace Dataset Base Implementation
+# -----------------------------------------------------------------------------
+class HuggingFaceDataset(BenchmarkDataset):
+    """Common base for datasets that are fetched through ``load_dataset``."""
+
+    # Dataset identifiers a subclass accepts: either a plain set, or a
+    # mapping from identifier to a callable (see the subclasses).
+    SUPPORTED_DATASET_PATHS: Union[set[str], dict[str, Callable]] = set()
+
+    def __init__(
+        self,
+        dataset_path: str,
+        dataset_split: str,
+        dataset_subset: Optional[str] = None,
+        **kwargs,
+    ) -> None:
+        super().__init__(dataset_path=dataset_path, **kwargs)
+
+        self.dataset_subset = dataset_subset
+        self.dataset_split = dataset_split
+        # Loading happens eagerly at construction time.
+        self.load_data()
+
+    def load_data(self) -> None:
+        """Open the dataset in streaming mode and shuffle it with ``random_seed``."""
+        self.data = load_dataset(
+            self.dataset_path,
+            split=self.dataset_split,
+            name=self.dataset_subset,
+            streaming=True,
+        )
+        self.data = self.data.shuffle(seed=self.random_seed)
+
+
+# -----------------------------------------------------------------------------
+# Conversation Dataset Implementation
+# -----------------------------------------------------------------------------
+
+
+class ConversationDataset(HuggingFaceDataset):
+    """Conversation-style dataset; items may carry an ``image`` field."""
+
+    SUPPORTED_DATASET_PATHS = {
+        "lmms-lab/LLaVA-OneVision-Data",
+        "Aeala/ShareGPT_Vicuna_unfiltered",
+    }
+    IS_MULTIMODAL = True
+
+    def sample(
+        self,
+        tokenizer: PreTrainedTokenizerBase,
+        num_requests: int,
+        output_len: Optional[int] = None,
+        enable_multimodal_chat: bool = False,
+        **kwargs,
+    ) -> list:
+        """
+        Build up to ``num_requests`` requests from the first two turns of
+        each conversation, oversampling if the stream yields too few.
+
+        When ``output_len`` is None the tokenized length of the second turn
+        is used as the expected output length, and pairs rejected by
+        ``is_valid_sequence`` are skipped.
+        """
+        use_completion_len = output_len is None
+        # Filter examples with at least 2 conversations
+        candidates = self.data.filter(lambda x: len(x["conversations"]) >= 2)
+        collected = []
+
+        for item in candidates:
+            if len(collected) >= num_requests:
+                break
+            turns = item["conversations"]
+            prompt = turns[0]["value"]
+            completion = turns[1]["value"]
+
+            prompt_len = len(tokenizer(prompt).input_ids)
+            completion_len = len(tokenizer(completion).input_ids)
+            if use_completion_len:
+                output_len = completion_len
+            assert isinstance(output_len, int) and output_len > 0
+            if use_completion_len and not is_valid_sequence(
+                prompt_len, completion_len
+            ):
+                continue
+            if "image" in item:
+                mm_content = process_image(item["image"])
+            else:
+                mm_content = None
+            if enable_multimodal_chat:
+                # Note: when chat is enabled the request prompt_len is no longer
+                # accurate and we will be using request output to count the
+                # actual prompt len and output len
+                prompt = self.apply_multimodal_chat_transformation(prompt, mm_content)
+            collected.append(
+                SampleRequest(
+                    prompt=prompt,
+                    prompt_len=prompt_len,
+                    expected_output_len=output_len,
+                    multi_modal_data=mm_content,
+                )
+            )
+        self.maybe_oversample_requests(collected, num_requests)
+        return collected
+
+
+# -----------------------------------------------------------------------------
+# Vision Arena Dataset Implementation
+# -----------------------------------------------------------------------------
+
+
+class VisionArenaDataset(HuggingFaceDataset):
+    """
+    Vision Arena Dataset.
+
+    ``SUPPORTED_DATASET_PATHS`` maps each dataset id to the function that
+    extracts the prompt text from one item.
+    """
+
+    DEFAULT_OUTPUT_LEN = 128
+    SUPPORTED_DATASET_PATHS = {
+        "lmarena-ai/VisionArena-Chat": lambda x: x["conversation"][0][0]["content"],
+        "lmarena-ai/vision-arena-bench-v0.1": lambda x: x["turns"][0][0]["content"],
+    }
+    IS_MULTIMODAL = True
+
+    def sample(
+        self,
+        tokenizer: PreTrainedTokenizerBase,
+        num_requests: int,
+        output_len: Optional[int] = None,
+        enable_multimodal_chat: bool = False,
+        **kwargs,
+    ) -> list:
+        """
+        Build up to ``num_requests`` requests, each pairing the extracted
+        prompt with the item's first image; oversamples if the stream yields
+        too few items.
+
+        Raises:
+            ValueError: If ``dataset_path`` has no registered prompt parser.
+        """
+        if output_len is None:
+            output_len = self.DEFAULT_OUTPUT_LEN
+        collected = []
+        for item in self.data:
+            if len(collected) >= num_requests:
+                break
+            extract_prompt = self.SUPPORTED_DATASET_PATHS.get(self.dataset_path)
+            if extract_prompt is None:
+                raise ValueError(f"Unsupported dataset path: {self.dataset_path}")
+            prompt = extract_prompt(item)
+            mm_content = process_image(item["images"][0])
+            prompt_len = len(tokenizer(prompt).input_ids)
+            if enable_multimodal_chat:
+                # Note: when chat is enabled the request prompt_len is no longer
+                # accurate and we will be using request output to count the
+                # actual prompt len
+                prompt = self.apply_multimodal_chat_transformation(prompt, mm_content)
+            request = SampleRequest(
+                prompt=prompt,
+                prompt_len=prompt_len,
+                expected_output_len=output_len,
+                multi_modal_data=mm_content,
+            )
+            collected.append(request)
+        self.maybe_oversample_requests(collected, num_requests)
+        return collected
+
+
+# -----------------------------------------------------------------------------
+# Instruct Coder Dataset Implementation
+# -----------------------------------------------------------------------------
+
+
+class InstructCoderDataset(HuggingFaceDataset):
+    """
+    InstructCoder Dataset.
+    https://huggingface.co/datasets/likaixin/InstructCoder
+
+    InstructCoder is the dataset designed for general code editing.  It consists
+    of 114,239 instruction-input-output triplets, and covers multiple distinct
+    code editing scenario.
+    """
+
+    DEFAULT_OUTPUT_LEN = 200  # this is the average default output length
+    SUPPORTED_DATASET_PATHS = {
+        "likaixin/InstructCoder",
+    }
+
+    def sample(
+        self,
+        tokenizer: PreTrainedTokenizerBase,
+        num_requests: int,
+        output_len: Optional[int] = None,
+        enable_multimodal_chat: bool = False,
+        **kwargs,
+    ) -> list:
+        """
+        Build up to ``num_requests`` chat-templated code-editing prompts,
+        oversampling if the stream yields too few items.
+
+        ``output_len`` falls back to ``DEFAULT_OUTPUT_LEN`` when None.
+        ``enable_multimodal_chat`` is accepted but not used by this dataset.
+        """
+        output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN
+        sampled_requests = []
+        for item in self.data:
+            if len(sampled_requests) >= num_requests:
+                break
+            # Use implicit string concatenation rather than a backslash
+            # continuation inside the literal: the continuation embedded the
+            # next line's indentation into the prompt text.
+            prompt = (
+                f"{item['input']}\n\n{item['instruction']} Just output "
+                "the code, do not include any explanation."
+            )
+
+            # apply template
+            prompt = tokenizer.apply_chat_template(
+                [{"role": "user", "content": prompt}],
+                add_generation_prompt=True,
+                tokenize=False,
+            )
+            prompt_len = len(tokenizer(prompt).input_ids)
+            sampled_requests.append(
+                SampleRequest(
+                    prompt=prompt,
+                    prompt_len=prompt_len,
+                    expected_output_len=output_len,
+                )
+            )
+        self.maybe_oversample_requests(sampled_requests, num_requests)
+        return sampled_requests
+
+
+# -----------------------------------------------------------------------------
+# MT-Bench Dataset Implementation
+# -----------------------------------------------------------------------------
+
+
+class MTBenchDataset(HuggingFaceDataset):
+    """
+    MT-Bench Dataset.
+    https://huggingface.co/datasets/philschmid/mt-bench
+
+    Only the first turn of each item is used, giving a single-turn dataset.
+    This is similar to Spec decoding benchmark setup in vLLM
+    https://github.com/vllm-project/vllm/blob/9d98ab5ec/examples/offline_inference/eagle.py#L14-L18
+    """  # noqa: E501
+
+    DEFAULT_OUTPUT_LEN = 256  # avg len used in SD bench in vLLM
+    SUPPORTED_DATASET_PATHS = {
+        "philschmid/mt-bench",
+    }
+
+    def sample(
+        self,
+        tokenizer: PreTrainedTokenizerBase,
+        num_requests: int,
+        output_len: Optional[int] = None,
+        enable_multimodal_chat: bool = False,
+        **kwargs,
+    ) -> list:
+        """
+        Build up to ``num_requests`` chat-templated first-turn prompts,
+        oversampling if the stream yields too few items.
+
+        ``output_len`` falls back to ``DEFAULT_OUTPUT_LEN`` when None.
+        """
+        if output_len is None:
+            output_len = self.DEFAULT_OUTPUT_LEN
+        collected = []
+
+        for item in self.data:
+            if len(collected) >= num_requests:
+                break
+            first_turn = item["turns"][0]
+
+            # apply template
+            templated = tokenizer.apply_chat_template(
+                [{"role": "user", "content": first_turn}],
+                add_generation_prompt=True,
+                tokenize=False,
+            )
+
+            request = SampleRequest(
+                prompt=templated,
+                prompt_len=len(tokenizer(templated).input_ids),
+                expected_output_len=output_len,
+            )
+            collected.append(request)
+        self.maybe_oversample_requests(collected, num_requests)
+        return collected
+
+
+# -----------------------------------------------------------------------------
+# AIMO Dataset Implementation
+# -----------------------------------------------------------------------------
+
+
+class AIMODataset(HuggingFaceDataset):
+    """
+    Dataset class for processing an AIMO dataset with reasoning questions.
+    Each item supplies a ``problem`` (the prompt) and a ``solution``.
+    """
+
+    SUPPORTED_DATASET_PATHS = {
+        "AI-MO/aimo-validation-aime",
+        "AI-MO/NuminaMath-1.5",
+        "AI-MO/NuminaMath-CoT",
+    }
+
+    def sample(
+        self,
+        tokenizer: PreTrainedTokenizerBase,
+        num_requests: int,
+        output_len: Optional[int] = None,
+        **kwargs,
+    ) -> list:
+        """
+        Build up to ``num_requests`` requests, oversampling if the stream
+        yields too few.
+
+        When ``output_len`` is None the tokenized solution length is used as
+        the expected output length, and pairs rejected by
+        ``is_valid_sequence`` (prompt <= 2048, total <= 32000 tokens) are
+        skipped.
+        """
+        use_solution_len = output_len is None
+        collected = []
+
+        for item in self.data:
+            if len(collected) >= num_requests:
+                break
+            prompt = item["problem"]
+            completion = item["solution"]
+
+            prompt_len = len(tokenizer(prompt).input_ids)
+            completion_len = len(tokenizer(completion).input_ids)
+            if use_solution_len:
+                output_len = completion_len
+            assert isinstance(output_len, int) and output_len > 0
+            if use_solution_len and not is_valid_sequence(
+                prompt_len, completion_len, max_prompt_len=2048, max_total_len=32000
+            ):
+                continue
+            request = SampleRequest(
+                prompt=prompt,
+                prompt_len=prompt_len,
+                expected_output_len=output_len,
+                multi_modal_data=None,
+            )
+            collected.append(request)
+        self.maybe_oversample_requests(collected, num_requests)
+        return collected
+
+
+# -----------------------------------------------------------------------------
+# Next Edit Prediction Dataset Implementation
+# -----------------------------------------------------------------------------
+
+
+# Prompt template for the zeta dataset. _format_zeta_prompt fills the first
+# ``{}`` with the sample's edit events and the second with its input excerpt.
+zeta_prompt = """### Instruction:
+You are a code completion assistant and your task is to analyze user edits and then rewrite an excerpt that the user provides, suggesting the appropriate edits within the excerpt, taking into account the cursor location.
+
+### User Edits:
+
+{}
+
+### User Excerpt:
+
+{}
+
+### Response:
+
+"""  # noqa: E501
+
+
+def _format_zeta_prompt(
+    sample: dict, original_start_marker: str = "<|editable_region_start|>"
+) -> dict:
+    """Format the zeta prompt for the Next Edit Prediction (NEP) dataset.
+
+    This function formats examples from the NEP dataset
+    into prompts and expected outputs. It could be
+    further extended to support more NEP datasets.
+
+    Args:
+        sample: The dataset sample containing the ``events``,
+            ``input`` and ``output`` keys.
+        original_start_marker: The marker indicating the
+            start of the editable region. Defaults to
+            "<|editable_region_start|>".
+
+    Returns:
+        A dictionary with the formatted ``prompt`` and the
+        ``expected_output``. The expected output is the raw output from the
+        start marker onwards, or the whole raw output when the marker does
+        not occur in it.
+
+    Raises:
+        KeyError: If ``sample`` lacks one of the required keys.
+    """
+    events = sample["events"]
+    excerpt = sample["input"]
+    raw_output = sample["output"]
+    prompt = zeta_prompt.format(events, excerpt)
+
+    # Following the original implementation, keep only the focused region of
+    # the raw output. str.find returns -1 when the marker is absent; slicing
+    # with that index would keep just the final character, so fall back to
+    # the complete output instead.
+    marker_index = raw_output.find(original_start_marker)
+    if marker_index == -1:
+        expected_output = raw_output
+    else:
+        expected_output = raw_output[marker_index:]
+
+    return {"prompt": prompt, "expected_output": expected_output}
+
+
+class NextEditPredictionDataset(HuggingFaceDataset):
+    """
+    Dataset class for processing a Next Edit Prediction dataset.
+
+    ``MAPPING_PROMPT_FUNCS`` maps each supported dataset path to the function
+    that turns a raw record into a ``prompt`` / ``expected_output`` pair.
+    """
+
+    SUPPORTED_DATASET_PATHS = {
+        "zed-industries/zeta",
+    }
+    MAPPING_PROMPT_FUNCS = {
+        "zed-industries/zeta": _format_zeta_prompt,
+    }
+
+    def sample(
+        self, tokenizer: PreTrainedTokenizerBase, num_requests: int, **kwargs
+    ) -> list:
+        """Collect up to ``num_requests`` requests from the dataset.
+
+        The prompt length and the expected output length are the token
+        counts of the formatted prompt and of the formatted expected output.
+
+        Args:
+            tokenizer: Tokenizer used for token counting.
+            num_requests: Number of requests to collect before stopping.
+            **kwargs: Accepted but not read by this method.
+
+        Returns:
+            The list of ``SampleRequest`` objects, after it has been passed
+            to ``maybe_oversample_requests``.
+
+        Raises:
+            ValueError: If no formatting function is registered for
+                ``self.dataset_path``.
+        """
+        formatting_prompt_func = self.MAPPING_PROMPT_FUNCS.get(self.dataset_path)
+        if formatting_prompt_func is None:
+            raise ValueError(f"Unsupported dataset path: {self.dataset_path}")
+
+        samples = []
+        for record in self.data:
+            # Check the limit before doing any work, as the sibling dataset
+            # classes do, so that num_requests=0 yields no sample at all.
+            if len(samples) >= num_requests:
+                break
+            formatted = formatting_prompt_func(record)
+            samples.append(
+                SampleRequest(
+                    prompt=formatted["prompt"],
+                    prompt_len=len(tokenizer(formatted["prompt"]).input_ids),
+                    expected_output_len=len(
+                        tokenizer(formatted["expected_output"]).input_ids
+                    ),
+                )
+            )
+        self.maybe_oversample_requests(samples, num_requests)
+        return samples
+
+
+# -----------------------------------------------------------------------------
+# ASR Dataset Implementation
+# -----------------------------------------------------------------------------
+
+
+class ASRDataset(HuggingFaceDataset):
+    """
+    Dataset wrapper that turns audio records into transcription requests.
+    Tested on the following set:
+
+    +----------------+----------------------------------------+--------------------------+-----------------------------+
+    | Dataset        | Domain                                 | Speaking Style           | hf-subset                   |
+    +----------------+----------------------------------------+--------------------------+-----------------------------+
+    | TED-LIUM       | TED talks                              | Oratory                  | release1, release2, release3|
+    |                |                                        |                          | release3-speaker-adaptation |
+    | VoxPopuli      | European Parliament                    | Oratory                  | en, de, it, fr,  ...        |
+    | LibriSpeech    | Audiobook                              | Narrated                 | "LIUM/tedlium"              |
+    | GigaSpeech     | Audiobook, podcast, YouTube            | Narrated, spontaneous    | xs, s, m, l, xl, dev, test  |
+    | SPGISpeech     | Financial meetings                     | Oratory, spontaneous     | S, M, L, dev, test          |
+    | AMI            | Meetings                               | Spontaneous              | ihm, sdm                    |
+    +----------------+----------------------------------------+--------------------------+-----------------------------+
+
+    """  # noqa: E501
+
+    SUPPORTED_DATASET_PATHS = {
+        "openslr/librispeech_asr",
+        "facebook/voxpopuli",
+        "LIUM/tedlium",
+        "edinburghcstr/ami",
+        "speechcolab/gigaspeech",
+        "kensho/spgispeech",
+    }
+
+    DEFAULT_OUTPUT_LEN = 128
+    IS_MULTIMODAL = True
+
+    # TODO Whisper-specific. Abstract interface when more models are supported.
+    TRANSCRIPTION_PREAMBLE = "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>"
+    # When true, clips longer than 30 seconds are dropped during sampling.
+    skip_long_audios: bool = True
+
+    def sample(
+        self,
+        tokenizer: PreTrainedTokenizerBase,
+        num_requests: int,
+        output_len: Optional[int] = None,
+        **kwargs,
+    ) -> list:
+        """Collect up to ``num_requests`` transcription requests.
+
+        Every request shares the same text prompt (the transcription
+        preamble) and carries its clip as ``{"audio": (array, rate)}``.
+        Clips longer than 30 seconds are skipped while ``skip_long_audios``
+        is set, and a warning reports how many were dropped.
+
+        Args:
+            tokenizer: Tokenizer used to count the preamble's tokens.
+            num_requests: Number of requests to collect before stopping.
+            output_len: Expected output length for every request;
+                ``DEFAULT_OUTPUT_LEN`` is used when ``None``.
+            **kwargs: Accepted but not read by this method.
+
+        Returns:
+            The list of ``SampleRequest`` objects, after it has been passed
+            to ``maybe_oversample_requests``.
+        """
+        import librosa
+
+        if output_len is None:
+            output_len = self.DEFAULT_OUTPUT_LEN
+
+        # The prompt is identical for all clips, so tokenize it only once.
+        preamble = ASRDataset.TRANSCRIPTION_PREAMBLE
+        preamble_len = len(tokenizer(preamble).input_ids)
+
+        requests = []
+        num_skipped = 0
+        for record in self.data:
+            if len(requests) >= num_requests:
+                break
+
+            clip = record["audio"]
+            waveform, sample_rate = clip["array"], clip["sampling_rate"]
+            seconds = librosa.get_duration(y=waveform, sr=sample_rate)
+
+            # Whisper max supported duration
+            if self.skip_long_audios and seconds > 30:
+                num_skipped += 1
+                continue
+
+            request = SampleRequest(
+                prompt=preamble,
+                prompt_len=preamble_len,
+                expected_output_len=output_len,
+                multi_modal_data={"audio": (waveform, sample_rate)},
+            )
+            requests.append(request)
+
+        if num_skipped:
+            logger.warning(
+                "%d samples discarded from dataset due to"
+                " their length being greater than"
+                " what Whisper supports.",
+                num_skipped,
+            )
+        self.maybe_oversample_requests(requests, num_requests)
+        return requests
--- a/benchmark_guided.py
+++ b/benchmark_guided.py
+# SPDX-License-Identifier: Apache-2.0
+"""Benchmark guided decoding throughput."""
+import argparse
+import dataclasses
+import json
+import os
+import random
+import time
+from typing import List
+
+import datasets
+import pandas as pd
+import uvloop
+from transformers import AutoTokenizer, PreTrainedTokenizerBase
+
+from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
+from vllm.entrypoints.openai.api_server import (
+    build_async_engine_client_from_engine_args)
+from vllm.sampling_params import GuidedDecodingParams
+from vllm.utils import FlexibleArgumentParser, merge_async_iterators
+
+
+@dataclasses.dataclass
+class SampleRequest:
+    """A class representing a single inference request for benchmarking.
+
+    Attributes:
+        prompt: The input text prompt for the model.
+        prompt_len: The length of the prompt in tokens.
+        expected_output_len: The expected length of the output in tokens.
+        schema: The structured-output constraint for this request. Despite
+            the ``dict`` annotation, ``sample_requests`` in this file also
+            stores a grammar or regex string, or a list of choices, here.
+        structure_type: Name of the ``GuidedDecodingParams`` keyword that
+            ``schema`` is passed under. Defaults to ``'json'``.
+        completion: Reference completion taken from the dataset, if any. It
+            is reported as the ``expected`` value next to the generated text.
+    """
+    prompt: str
+    prompt_len: int
+    expected_output_len: int
+    schema: dict
+    structure_type: str = 'json'
+    completion: str = None
+
+
+def run_vllm(requests: List[SampleRequest],
+             engine_args: EngineArgs,
+             n: int,
+             guided_decoding_rate: float = 1.0,
+             warmup: bool = False) -> tuple:
+    """Run the benchmark with the synchronous ``LLM`` engine.
+
+    Args:
+        requests: The requests to generate for.
+        engine_args: Engine arguments; their attributes are passed to ``LLM``
+            as keyword arguments.
+        n: Number of sequences to generate per prompt.
+        guided_decoding_rate: Fraction of requests, chosen at random, that
+            use guided decoding.
+        warmup: Whether to generate for the first five requests before the
+            timed run.
+
+    Returns:
+        A ``(elapsed_seconds, results)`` tuple. ``results`` holds one dict per
+        request with the ``generated`` text of its first output and the
+        ``expected`` reference completion.
+    """
+    from vllm import LLM, SamplingParams
+    llm = LLM(**vars(engine_args))
+
+    def _sampling_params(request: SampleRequest, guided: bool):
+        # The schema is passed under the keyword named by the request's
+        # structure type, so grammar, regex and choice requests are not
+        # mistaken for JSON schemas (this also applies to the warmup).
+        guided_decoding = None
+        if guided:
+            guided_decoding = GuidedDecodingParams(
+                **{request.structure_type: request.schema})
+        return SamplingParams(
+            n=n,
+            temperature=1.0,
+            top_p=1.0,
+            ignore_eos=True,
+            max_tokens=request.expected_output_len,
+            guided_decoding=guided_decoding,
+        )
+
+    # Indices of the requests that use guided decoding. Kept as a set so the
+    # membership test in the per-request loop below is constant time.
+    guided_decoding_req_idx = set(
+        random.sample(range(len(requests)),
+                      int(len(requests) * guided_decoding_rate)))
+
+    if warmup:
+        print(">>>>> Running warmup prompt, for the first 5")
+        # We setup the first 5 requests to warmup FSM
+        # if using xgrammar dataset, we will skip warmup
+        warmup_requests = requests[:5]
+        warmup_guided = guided_decoding_rate > 0
+        llm.generate(
+            [request.prompt for request in warmup_requests],
+            [
+                _sampling_params(request, warmup_guided)
+                for request in warmup_requests
+            ],
+            use_tqdm=False)
+
+    print(">>>>> Benchmark started...")
+    prompts: List[str] = [request.prompt for request in requests]
+    sampling_params = [
+        _sampling_params(request, i in guided_decoding_req_idx)
+        for i, request in enumerate(requests)
+    ]
+
+    start = time.perf_counter()
+    outputs = llm.generate(prompts, sampling_params, use_tqdm=False)
+    ret = [{
+        "generated": output.outputs[0].text,
+        "expected": request.completion
+    } for output, request in zip(outputs, requests)]
+    end = time.perf_counter()
+    return end - start, ret
+
+
+async def run_vllm_async(
+        requests: List[SampleRequest],
+        engine_args: AsyncEngineArgs,
+        n: int,
+        guided_decoding_rate: float = 1.0,
+        warmup: bool = False,
+        disable_frontend_multiprocessing: bool = False) -> tuple:
+    """Run the benchmark with the async engine client.
+
+    Args:
+        requests: The requests to generate for.
+        engine_args: Arguments used to build the async engine client.
+        n: Number of sequences to generate per prompt.
+        guided_decoding_rate: Fraction of requests, chosen at random, that
+            use guided decoding.
+        warmup: Whether to generate for the first five requests before the
+            timed run.
+        disable_frontend_multiprocessing: Forwarded to the engine client
+            builder.
+
+    Returns:
+        A ``(elapsed_seconds, results, (first_latency, next_latency))``
+        tuple. ``results`` holds one ``generated`` / ``expected`` dict per
+        request. The latencies are ``pandas.Series`` in milliseconds:
+        ``first_latency`` is the time until a request's first output and
+        ``next_latency`` the average gap between its later outputs. An entry
+        is NaN when it cannot be computed (no output for the first latency,
+        fewer than two outputs for the next latency).
+    """
+    from vllm import SamplingParams
+
+    def _sampling_params(request: SampleRequest, guided: bool):
+        # Pass the schema under the keyword named by the request's structure
+        # type, matching what the synchronous runner does for the timed run.
+        guided_decoding = None
+        if guided:
+            guided_decoding = GuidedDecodingParams(
+                **{request.structure_type: request.schema})
+        return SamplingParams(
+            n=n,
+            temperature=1.0,
+            top_p=1.0,
+            ignore_eos=True,
+            max_tokens=request.expected_output_len,
+            guided_decoding=guided_decoding,
+        )
+
+    async with build_async_engine_client_from_engine_args(
+            engine_args, disable_frontend_multiprocessing) as llm:
+
+        # Indices of the requests that use guided decoding. Kept as a set so
+        # the membership test in the per-request loop is constant time.
+        guided_decoding_req_idx = set(
+            random.sample(range(len(requests)),
+                          int(len(requests) * guided_decoding_rate)))
+
+        if warmup:
+            print(">>>>>> Running warmup prompt, for the first 5")
+            # We setup the first 5 requests to warmup FSM
+            # if using xgrammar dataset, we will skip warmup
+            warmup_requests = requests[:5]
+            warmup_guided = guided_decoding_rate > 0
+            warmup_params = [
+                _sampling_params(request, warmup_guided)
+                for request in warmup_requests
+            ]
+            warmup_generators = [
+                llm.generate(request.prompt, sp, request_id=f"test{i}")
+                for i, (request,
+                        sp) in enumerate(zip(warmup_requests, warmup_params))
+            ]
+            # Drain the warmup outputs; their content is not used.
+            async for _ in merge_async_iterators(*warmup_generators):
+                pass
+
+        print(">>>>> Benchmark started...")
+        prompts: List[str] = [request.prompt for request in requests]
+        sampling_params = [
+            _sampling_params(request, i in guided_decoding_req_idx)
+            for i, request in enumerate(requests)
+        ]
+
+        generators = []
+        start_time = []
+        start = time.perf_counter()
+        for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)):
+            generators.append(llm.generate(prompt, sp, request_id=f"test{i}"))
+            start_time.append(time.perf_counter())
+        # latencies[i] collects, for request i, the time since its submission
+        # at which each of its outputs arrived.
+        latencies = [[] for _ in requests]
+        generated_texts = [''] * len(requests)
+        async for i, res in merge_async_iterators(*generators):
+            generated_texts[i] = res.outputs[0].text
+            latencies[i].append(time.perf_counter() - start_time[i])
+        ret = [{
+            'generated': gt,
+            'expected': req.completion
+        } for gt, req in zip(generated_texts, requests)]
+        end = time.perf_counter()
+
+        # A request with no output has no first latency, and one with a
+        # single output has no inter-output gap. Record NaN for those instead
+        # of failing with IndexError / ZeroDivisionError.
+        nan = float("nan")
+        first_latency = pd.Series(
+            [lat[0] * 1000 if lat else nan for lat in latencies])
+        next_latency = pd.Series([
+            (lat[-1] - lat[0]) / (len(lat) - 1) * 1000 if len(lat) > 1 else nan
+            for lat in latencies
+        ])
+        return end - start, ret, (first_latency, next_latency)
+
+
+def sample_requests(tokenizer: PreTrainedTokenizerBase,
+                    args: argparse.Namespace) -> List[SampleRequest]:
+    """Build the benchmark requests for ``args.dataset``.
+
+    The ``json``, ``grammar``, ``regex`` and ``choice`` datasets repeat one
+    fixed prompt ``args.num_prompts`` times. ``xgrammar_bench`` takes prompts,
+    schemas and reference completions from ``NousResearch/json-mode-eval``
+    and wraps around when more prompts are requested than the dataset holds.
+
+    Side effects on ``args``: ``json_schema_path`` is set to the bundled
+    default schema when unset, ``regex`` / ``choice`` are stored (they are
+    read back by ``evaluate``), and ``warmup`` is switched off for
+    ``xgrammar_bench``.
+
+    Args:
+        tokenizer: Tokenizer used to measure prompt lengths and, for
+            ``xgrammar_bench``, to apply the chat template.
+        args: Parsed command-line arguments.
+
+    Returns:
+        The list of requests to benchmark.
+
+    Raises:
+        ValueError: If ``args.dataset`` is not a supported dataset name, or
+            if prompts are requested from an empty ``xgrammar_bench`` dataset.
+    """
+
+    def _repeat_prompt(prompt: str, schema) -> List[SampleRequest]:
+        # The fixed-prompt datasets send the same request num_prompts times.
+        input_len = len(tokenizer(prompt).input_ids)
+        print(f"Input length of the prompt: {input_len} tokens")
+        return [
+            SampleRequest(prompt=prompt,
+                          prompt_len=input_len,
+                          expected_output_len=args.output_len,
+                          schema=schema,
+                          structure_type=args.structure_type)
+            for _ in range(args.num_prompts)
+        ]
+
+    if args.dataset == 'json':
+        if args.json_schema_path is None:
+            dir_path = os.path.dirname(os.path.realpath(__file__))
+            args.json_schema_path = os.path.join(dir_path,
+                                                 "structured_schemas",
+                                                 "structured_schema_1.json")
+        with open(args.json_schema_path, encoding="utf-8") as f:
+            schema = json.load(f)
+        prompt = f"Generate an example of a user profile given the following schema: {json.dumps(schema)}"  # noqa: E501
+        requests = _repeat_prompt(prompt, schema)
+
+    elif args.dataset == "grammar":
+        schema = """
+            ?start: select_statement
+
+            ?select_statement: "SELECT " column_list " FROM " table_name
+
+            ?column_list: column_name ("," column_name)*
+
+            ?table_name: identifier
+
+            ?column_name: identifier
+
+            ?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/
+        """
+        prompt = "Generate an SQL query to show the 'username' \
+            and 'email' from the 'users' table."
+
+        requests = _repeat_prompt(prompt, schema)
+
+    elif args.dataset == "regex":
+        regex = r"\w+@\w+\.com\n"
+        args.regex = regex
+        prompt = "Generate an email address for Alan Turing, \
+            who works in Enigma. End in .com and new line. \
+                Example result: alan.turing@enigma.com\n"
+
+        requests = _repeat_prompt(prompt, regex)
+
+    elif args.dataset == "choice":
+        choice = ["Positive", "Negative"]
+        args.choice = choice
+        prompt = "Classify this sentiment: vLLM is wonderful!"
+        requests = _repeat_prompt(prompt, choice)
+
+    elif args.dataset == "xgrammar_bench":
+        args.warmup = False
+        requests: List[SampleRequest] = []
+        dataset = datasets.load_dataset("NousResearch/json-mode-eval",
+                                        split="train")
+        print(f"dataset has {len(dataset)} entries")
+        len_dataset = len(dataset)
+        if args.num_prompts > 0 and len_dataset == 0:
+            raise ValueError(
+                "NousResearch/json-mode-eval returned no entries")
+        # Read each column once instead of once per requested prompt.
+        schemas = dataset["schema"]
+        chat_prompts = dataset["prompt"]
+        completions = dataset["completion"]
+        for data_point_idx in range(args.num_prompts):
+            # Wrap around when more prompts are requested than available.
+            idx = data_point_idx % len_dataset
+            prompt = tokenizer.apply_chat_template(chat_prompts[idx],
+                                                   tokenize=False)
+            input_len = len(tokenizer(prompt).input_ids)
+
+            requests.append(
+                SampleRequest(prompt=prompt,
+                              prompt_len=input_len,
+                              expected_output_len=args.output_len,
+                              schema=schemas[idx],
+                              completion=completions[idx]))
+
+    else:
+        # Without this branch an unknown name would surface later as an
+        # UnboundLocalError on ``requests``.
+        raise ValueError(f"Unsupported dataset: {args.dataset}")
+
+    return requests
+
+
+def evaluate(ret, args):
+    """Score the generated outputs and return the percentage judged correct.
+
+    Each entry of ``ret`` gains a ``correctness`` key holding ``True``,
+    ``False`` or, for structure types without a checker, ``None``.
+
+    Args:
+        ret: List of dicts with the ``expected`` and ``generated`` keys.
+        args: Namespace providing ``structure_type`` and, depending on it,
+            ``regex`` or ``choice``.
+
+    Returns:
+        The share of correct entries among the scored ones, in percent, or
+        ``None`` when no entry could be scored.
+    """
+    import re
+
+    def _json_ok(expected, actual):
+        # Newlines and spaces are removed, then the span from the first "{"
+        # to the last "}" must parse as JSON. Any failure counts as wrong.
+        compact = actual.replace('\n', '').replace(' ', '').strip()
+        try:
+            json.loads(re.search(r'\{.*\}', compact).group())
+        except Exception:
+            return False
+        return True
+
+    def _choice_ok(expected, actual):
+        return actual in args.choice
+
+    def _regex_ok(expected, actual):
+        return re.match(args.regex, actual) is not None
+
+    # Structure types without an entry here (e.g. grammar) are not scored.
+    checkers = {
+        'json': _json_ok,
+        'regex': _regex_ok,
+        'choice': _choice_ok,
+    }
+
+    scores = []
+    for res in ret:
+        expected, generated = res['expected'], res['generated']
+        checker = checkers.get(args.structure_type)
+        score = None if checker is None else checker(expected, generated)
+        res['correctness'] = score
+        scores.append(score)
+
+    scored = [score for score in scores if score is not None]
+    if len(scored) > 0:
+        return sum(scored) / len(scored) * 100
+    return None
+
+
+def main(args: argparse.Namespace):
+    """Run the guided-decoding benchmark described by ``args``.
+
+    Builds the requests, runs them through the sync or async engine, prints
+    throughput, correctness and (for the async engine) latency statistics,
+    and optionally writes the results as JSON.
+    """
+    print(args)
+    random.seed(args.seed)
+
+    # For these datasets the structure type equals the dataset name and the
+    # synchronous engine is always used.
+    # NOTE(review): the previous comment here said the async engine "is
+    # working" for them, which the code contradicts - confirm the intent.
+    if args.dataset in ('grammar', 'regex', 'choice'):
+        args.structure_type = args.dataset
+        args.async_engine = False
+    else:
+        args.structure_type = 'json'
+
+    if args.no_guided_decoding:
+        args.guided_decoding_ratio = 0
+
+    # The file name is derived before the requests are sampled, so it shows
+    # the warmup flag as given on the command line.
+    result_file_name = None
+    if args.save_results:
+        name_parts = [
+            f'{args.guided_decoding_ratio}guided',
+            f"{args.model.split('/')[-1]}",
+            f"{args.dataset}",
+            f"{args.num_prompts}",
+            f"out{args.output_len}",
+            f"async{args.async_engine}",
+            f"warmup{args.warmup}",
+            f"chunkedprefill{args.enable_chunked_prefill}",
+        ]
+        result_file_name = "_".join(name_parts) + ".txt"
+
+    # Synthesize a prompt with the given input length.
+    tokenizer = AutoTokenizer.from_pretrained(
+        args.tokenizer, trust_remote_code=args.trust_remote_code)
+    requests = sample_requests(tokenizer, args)
+
+    # Latency series are only produced by the async engine.
+    first_latency, next_latency = None, None
+    if args.async_engine:
+        engine_args = AsyncEngineArgs.from_cli_args(args)
+        elapsed_time, ret, (first_latency, next_latency) = uvloop.run(
+            run_vllm_async(requests, engine_args, args.n,
+                           args.guided_decoding_ratio, args.warmup,
+                           args.disable_frontend_multiprocessing))
+    else:
+        engine_args = EngineArgs.from_cli_args(args)
+        elapsed_time, ret = run_vllm(requests, engine_args, args.n,
+                                     args.guided_decoding_ratio, args.warmup)
+
+    score = evaluate(ret, args)
+    total_num_tokens = sum(request.prompt_len + request.expected_output_len
+                           for request in requests)
+    total_output_tokens = sum(request.expected_output_len
+                              for request in requests)
+
+    latency_breakdown = ''
+    if first_latency is not None:
+        latency_breakdown = ("\nFirst token latency(msecs):\n"
+                             f"{first_latency.describe()}"
+                             "\nNext token latency(msecs):\n"
+                             f"{next_latency.describe()}")
+    throughput = (
+        f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
+        f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
+        f"{total_output_tokens / elapsed_time:.2f} output tokens/s")
+    print(throughput, f"Correct rate is {score} %", f"{latency_breakdown}")
+
+    # Output JSON results if specified; an explicit --output-json path takes
+    # precedence over the generated file name.
+    output_path = args.output_json or result_file_name
+    if output_path:
+        results = {
+            "outputs": ret,
+            "elapsed_time": elapsed_time,
+            "num_requests": len(requests),
+            "total_num_tokens": total_num_tokens,
+            "total_output_tokens": total_output_tokens,
+            "requests_per_second": len(requests) / elapsed_time,
+            "tokens_per_second": f"{total_num_tokens / elapsed_time:.2f}",
+            "output_tokens_per_second":
+            f"{total_output_tokens / elapsed_time:.2f}",
+            "correct_rate(%)": score,
+        }
+        if first_latency is not None:
+            results["first_token_latency(msecs)"] = first_latency.describe(
+            ).to_dict()
+            results["next_token_latency(msecs)"] = next_latency.describe(
+            ).to_dict()
+        with open(output_path, "w") as f:
+            json.dump(results, f, indent=4)
+
+
+if __name__ == "__main__":
+    # Engine options come from AsyncEngineArgs; the options added below are
+    # specific to this benchmark.
+    parser = FlexibleArgumentParser(description="Benchmark guided decoding.")
+    parser = AsyncEngineArgs.add_cli_args(parser)
+
+    # Workload selection and size.
+    parser.add_argument(
+        "--output-len",
+        type=int,
+        default=512,
+        help=("Output length for each request. "
+              "Overrides the output length from the dataset."),
+    )
+    parser.add_argument(
+        "--dataset",
+        default="json",
+        choices=["json", "grammar", "regex", "choice", "xgrammar_bench"],
+    )
+    parser.add_argument(
+        "--json_schema_path",
+        type=str,
+        default=None,
+        help="Path to json schema.",
+    )
+    parser.add_argument(
+        "--n",
+        type=int,
+        default=1,
+        help="Number of generated sequences per prompt.",
+    )
+    parser.add_argument(
+        "--num-prompts",
+        type=int,
+        default=10,
+        help="Number of prompts to process.",
+    )
+
+    # Result output.
+    parser.add_argument(
+        "--output-json",
+        type=str,
+        default=None,
+        help="Path to save the throughput results in JSON format.",
+    )
+
+    # Engine selection and guided-decoding behaviour.
+    parser.add_argument(
+        "--async-engine",
+        action="store_true",
+        default=False,
+        help="Use vLLM async engine rather than LLM class.",
+    )
+    parser.add_argument(
+        "--no-guided-decoding",
+        action="store_true",
+        default=False,
+        help="Whether to disable JSON decoding or not.",
+    )
+    parser.add_argument(
+        "--guided-decoding-ratio",
+        type=float,
+        default=1.0,
+        help="Ratio of Guided Decoding requests",
+    )
+    parser.add_argument(
+        "--disable-frontend-multiprocessing",
+        action="store_true",
+        default=False,
+        help="Disable decoupled async engine frontend.",
+    )
+    parser.add_argument(
+        "--warmup",
+        action="store_true",
+        default=False,
+        help="Run warmup prompts before benchmark.",
+    )
+    parser.add_argument(
+        "--save-results",
+        action="store_true",
+        default=False,
+        help="save output results.",
+    )
+
+    args = parser.parse_args()
+    # Fall back to the model's own tokenizer when none was given.
+    if args.tokenizer is None:
+        args.tokenizer = args.model
+    main(args)
\ No newline at end of file
--- a/benchmark_latency.py
+++ b/benchmark_latency.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Benchmark the latency of processing a single batch of requests."""
+
+import argparse
+import dataclasses
+import json
+import os
+import time
+from typing import Any, Optional
+
+import numpy as np
+from tqdm import tqdm
+
+import vllm.envs as envs
+from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
+from vllm import LLM, SamplingParams
+from vllm.engine.arg_utils import EngineArgs
+from vllm.inputs import PromptType
+from vllm.sampling_params import BeamSearchParams
+from vllm.utils import FlexibleArgumentParser
+
+
+def save_to_pytorch_benchmark_format(
+    args: argparse.Namespace, results: dict[str, Any]
+) -> None:
+    """Write the latency results in the PyTorch benchmark format.
+
+    The records are produced by ``convert_to_pytorch_benchmark_format``.
+    Nothing is written when it returns no records. The target file is
+    ``args.output_json`` with its extension replaced by ``.pytorch.json``.
+    """
+    metrics = {"latency": results["latencies"]}
+    extra_info = {
+        "avg_latency": results["avg_latency"],
+        "percentiles": results["percentiles"],
+    }
+    pt_records = convert_to_pytorch_benchmark_format(
+        args=args, metrics=metrics, extra_info=extra_info
+    )
+    if not pt_records:
+        return
+
+    base_path, _ext = os.path.splitext(args.output_json)
+    write_to_json(f"{base_path}.pytorch.json", pt_records)
+
+
+def main(args: argparse.Namespace):
+    """Measure the latency of generating for one batch of dummy prompts.
+
+    After the warmup iterations the function either performs a single
+    profiled run (``args.profile``) and returns, or times ``args.num_iters``
+    runs, prints the average and percentile latencies, and optionally writes
+    them to ``args.output_json``.
+    """
+    print(args)
+
+    engine_args = EngineArgs.from_cli_args(args)
+
+    # NOTE(woosuk): If the request cannot be processed in a single batch,
+    # the engine will automatically process the request in multiple batches.
+    llm = LLM(**dataclasses.asdict(engine_args))
+    max_model_len = llm.llm_engine.model_config.max_model_len
+    assert max_model_len >= (args.input_len + args.output_len), (
+        "Please ensure that max_model_len is greater than"
+        " the sum of input_len and output_len."
+    )
+
+    sampling_params = SamplingParams(
+        n=args.n,
+        temperature=1.0,
+        top_p=1.0,
+        ignore_eos=True,
+        max_tokens=args.output_len,
+        detokenize=not args.disable_detokenize,
+    )
+    print(sampling_params)
+
+    # One row of random token ids per prompt in the batch.
+    token_id_matrix = np.random.randint(
+        10000, size=(args.batch_size, args.input_len)
+    )
+    dummy_prompts: list[PromptType] = [
+        {"prompt_token_ids": row} for row in token_id_matrix.tolist()
+    ]
+
+    def llm_generate():
+        """Generate once for the whole batch, by sampling or beam search."""
+        if args.use_beam_search:
+            beam_params = BeamSearchParams(
+                beam_width=args.n,
+                max_tokens=args.output_len,
+                ignore_eos=True,
+            )
+            llm.beam_search(dummy_prompts, beam_params)
+            return
+        llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False)
+
+    def run_to_completion(profile_dir: Optional[str] = None):
+        """Run one batch; return its latency unless a profile dir is given."""
+        if not profile_dir:
+            started = time.perf_counter()
+            llm_generate()
+            return time.perf_counter() - started
+        # Profiled run: nothing is timed and nothing is returned.
+        llm.start_profile()
+        llm_generate()
+        llm.stop_profile()
+
+    print("Warming up...")
+    for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
+        run_to_completion(profile_dir=None)
+
+    if args.profile:
+        profile_dir = envs.VLLM_TORCH_PROFILER_DIR
+        print(f"Profiling (results will be saved to '{profile_dir}')...")
+        run_to_completion(profile_dir=profile_dir)
+        return
+
+    # Benchmark.
+    latencies = np.array(
+        [
+            run_to_completion(profile_dir=None)
+            for _ in tqdm(range(args.num_iters), desc="Profiling iterations")
+        ]
+    )
+    percentages = [10, 25, 50, 75, 90, 99]
+    percentiles = np.percentile(latencies, percentages)
+    print(f"Avg latency: {np.mean(latencies)} seconds")
+    for percentage, percentile in zip(percentages, percentiles):
+        print(f"{percentage}% percentile latency: {percentile} seconds")
+
+    # Output JSON results if specified
+    if not args.output_json:
+        return
+    results = {
+        "avg_latency": np.mean(latencies),
+        "latencies": latencies.tolist(),
+        "percentiles": dict(zip(percentages, percentiles.tolist())),
+    }
+    with open(args.output_json, "w") as f:
+        json.dump(results, f, indent=4)
+    save_to_pytorch_benchmark_format(args, results)
+
+
+def create_argument_parser():
+    """Build the command-line parser for the single-batch latency benchmark.
+
+    Benchmark-specific options are registered first, in a fixed order, and
+    the engine options are then appended via ``EngineArgs.add_cli_args``.
+
+    Returns:
+        The parser returned by ``EngineArgs.add_cli_args``, with
+        ``enable_prefix_caching`` defaulting to ``False``.
+    """
+    # (flag, add_argument keyword arguments), listed in registration order.
+    benchmark_options = [
+        ("--input-len", {"type": int, "default": 32}),
+        ("--output-len", {"type": int, "default": 128}),
+        ("--batch-size", {"type": int, "default": 8}),
+        (
+            "--n",
+            {
+                "type": int,
+                "default": 1,
+                "help": "Number of generated sequences per prompt.",
+            },
+        ),
+        ("--use-beam-search", {"action": "store_true"}),
+        (
+            "--num-iters-warmup",
+            {
+                "type": int,
+                "default": 10,
+                "help": "Number of iterations to run for warmup.",
+            },
+        ),
+        (
+            "--num-iters",
+            {"type": int, "default": 30, "help": "Number of iterations to run."},
+        ),
+        (
+            "--profile",
+            {
+                "action": "store_true",
+                "help": "profile the generation process of a single batch",
+            },
+        ),
+        (
+            "--output-json",
+            {
+                "type": str,
+                "default": None,
+                "help": "Path to save the latency results in JSON format.",
+            },
+        ),
+        (
+            "--disable-detokenize",
+            {
+                "action": "store_true",
+                "help": (
+                    "Do not detokenize responses (i.e. do not include "
+                    "detokenization time in the latency measurement)"
+                ),
+            },
+        ),
+    ]
+
+    parser = FlexibleArgumentParser(
+        description=(
+            "Benchmark the latency of processing a single batch of "
+            "requests till completion."
+        )
+    )
+    for flag, options in benchmark_options:
+        parser.add_argument(flag, **options)
+
+    parser = EngineArgs.add_cli_args(parser)
+    # V1 turns prefix caching on by default, which would skew the latency
+    # numbers, so this benchmark defaults it to off.
+    parser.set_defaults(enable_prefix_caching=False)
+
+    return parser
+
+
+if __name__ == "__main__":
+    parser = create_argument_parser()
+    args = parser.parse_args()
+    # Profiling needs an output directory; fail before any benchmark work
+    # starts when it has not been configured.
+    if args.profile:
+        if not envs.VLLM_TORCH_PROFILER_DIR:
+            raise OSError(
+                "The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. "
+                "Please set it to a valid path to use torch profiler."
+            )
+    main(args)
--- a/benchmark_long_document_qa_throughput.py
+++ b/benchmark_long_document_qa_throughput.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Offline benchmark to test the long document QA throughput.
+
+Example usage:
+    # This workload samples 8 different prompts with a default input
+    # length of 20000 tokens, then replicates each prompt 2 times 
+    # in random order.
+    python benchmark_long_document_qa_throughput.py \
+        --model meta-llama/Llama-2-7b-chat-hf \
+        --enable-prefix-caching \
+        --num-documents 8 \
+        --repeat-count 2 
+
+Commandline arguments:
+    --num-documents: The number of documents to sample prompts from.
+                     (Optional, default: 8)
+
+    --document-length: The length of each document in tokens. 
+                       (Optional, default: 20000)
+
+    --output-len: The number of tokens to generate for each prompt.
+                  (Optional, default: 10)
+
+    --repeat-count: The number of times to repeat each prompt.
+                    (Optional, default: 2)
+
+    --repeat-mode: The mode to repeat prompts. The supported modes are:
+        - 'random': shuffle the prompts randomly. (Default)
+        - 'tile': the entire prompt list is repeated in sequence. (Potentially
+                  lowest cache hit)
+        - 'interleave': each prompt is repeated consecutively before 
+                        moving to the next element. (Highest cache hit)
+    
+    --shuffle-seed: Random seed when the repeat mode is "random".
+                    (Optional, default: 0)
+
+In the meantime, it also supports all the vLLM engine args to initialize the 
+LLM engine. You can refer to the `vllm.engine.arg_utils.EngineArgs` for more
+details.
+"""
+
+import dataclasses
+import random
+import time
+
+from vllm import LLM, SamplingParams
+from vllm.engine.arg_utils import EngineArgs
+from vllm.utils import FlexibleArgumentParser
+
+
+def test_long_document_qa(llm=None, sampling_params=None, prompts=None):
+    """
+    Run one generation pass over the prompts and print the elapsed time.
+
+    Args:
+        llm: Object whose ``generate(prompts, sampling_params=...)`` method
+            is called exactly once with all the prompts.
+        sampling_params: Forwarded unchanged to ``llm.generate``.
+        prompts: The prompts forwarded to ``llm.generate``.
+    """
+    # perf_counter is monotonic, so the measured interval cannot be skewed by
+    # wall-clock adjustments the way a time.time() difference can.
+    start_time = time.perf_counter()
+    llm.generate(prompts, sampling_params=sampling_params)
+    elapsed = time.perf_counter() - start_time
+    print(f"Time to execute all requests: {elapsed:.4f} secs")
+
+
+def repeat_prompts(prompts, repeat_count, mode: str):
+    """
+    Build a list in which every prompt occurs ``repeat_count`` times.
+
+    The selected mode is printed before any work is done.
+
+    Args:
+        prompts: A list of prompts to be repeated.
+        repeat_count: The number of times each prompt is repeated.
+        mode: How the repeated prompts are ordered:
+            - 'random': the tiled list is shuffled with ``random.shuffle``.
+            - 'tile': the whole list is repeated back to back.
+              Example: [1, 2, 3] -> [1, 2, 3, 1, 2, 3].
+            - 'interleave': each prompt is repeated before the next one.
+              Example: [1, 2, 3] -> [1, 1, 2, 2, 3, 3].
+
+    Returns:
+        A new list holding the repeated prompts in the requested order.
+
+    Raises:
+        ValueError: If ``mode`` is not one of the supported modes.
+    """
+    print("Repeat mode: ", mode)
+    if mode not in ("random", "tile", "interleave"):
+        raise ValueError(
+            f"Invalid mode: {mode}, only support 'random', 'tile', 'interleave'"
+        )
+
+    if mode == "interleave":
+        return [prompt for prompt in prompts for _ in range(repeat_count)]
+
+    # 'tile' and 'random' both start from the back-to-back repetition; the
+    # multiplication creates a new list, so the caller's list is untouched.
+    tiled = prompts * repeat_count
+    if mode == "random":
+        random.shuffle(tiled)
+    return tiled
+
+
+def main(args):
+    """Run the long-document QA benchmark: one warm-up pass, one timed pass.
+
+    Args:
+        args: Parsed command-line namespace; the benchmark options and the
+            engine options consumed by ``EngineArgs.from_cli_args`` are read
+            from it.
+    """
+    random.seed(args.shuffle_seed)
+
+    # Every synthetic document shares the same filler body and differs only
+    # in its leading document id, so no document is a prefix of another.
+    filler = " ".join(["hi"] * args.document_length)
+    document_ids = range(args.num_documents)
+
+    documents = [str(doc_id) + filler for doc_id in document_ids]
+    prompts = repeat_prompts(documents, args.repeat_count, mode=args.repeat_mode)
+
+    warmup_prompts = [
+        "This is warm up request " + str(doc_id) + filler for doc_id in document_ids
+    ]
+
+    # Create the LLM engine
+    llm = LLM(**dataclasses.asdict(EngineArgs.from_cli_args(args)))
+    sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
+
+    phases = (
+        ("------warm up------", warmup_prompts),
+        ("------start generating------", prompts),
+    )
+    for banner, batch in phases:
+        print(banner)
+        test_long_document_qa(
+            llm=llm,
+            prompts=batch,
+            sampling_params=sampling_params,
+        )
+
+
+def create_argument_parser():
+    """Build the command-line parser for the long-document QA benchmark.
+
+    Registers the benchmark-specific options and then appends the engine
+    options through ``EngineArgs.add_cli_args``.
+
+    Returns:
+        The parser returned by ``EngineArgs.add_cli_args``.
+    """
+    parser = FlexibleArgumentParser(
+        description="Benchmark the performance with or "
+        "without automatic prefix caching."
+    )
+
+    parser.add_argument(
+        "--document-length",
+        type=int,
+        # Roughly the number of tokens for a system paper,
+        # excluding images
+        default=20000,
+        # The previous help text described an unrelated "min:max" range
+        # option; this flag is a single integer length.
+        help="The length of each document in tokens.",
+    )
+
+    parser.add_argument(
+        "--num-documents",
+        type=int,
+        default=8,
+        # Same copy-paste fix as above: this is a document count, not a range.
+        help="The number of documents to sample prompts from.",
+    )
+
+    parser.add_argument("--output-len", type=int, default=10)
+
+    parser.add_argument(
+        "--repeat-count",
+        type=int,
+        default=2,
+        help="Number of times to repeat each prompt",
+    )
+
+    parser.add_argument(
+        "--repeat-mode",
+        type=str,
+        default="random",
+        help="The mode to repeat prompts. The supported "
+        'modes are "random", "tile", and "interleave". '
+        "See repeat_prompts() in the source code for details.",
+    )
+
+    parser.add_argument(
+        "--shuffle-seed",
+        type=int,
+        default=0,
+        help='Random seed when the repeat mode is "random"',
+    )
+
+    parser = EngineArgs.add_cli_args(parser)
+
+    return parser
+
+
+if __name__ == "__main__":
+    # Script entry point: parse the command line and run the benchmark.
+    parser = create_argument_parser()
+    args = parser.parse_args()
+    main(args)
--- a/benchmark_prefix_caching.py
+++ b/benchmark_prefix_caching.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Benchmark the efficiency of prefix caching.
+
+This script allows you to benchmark the performance of
+a model with and without prefix caching using either fixed prompts
+or prompts sampled from the ShareGPT dataset.
+
+Fixed example usage:
+    python benchmark_prefix_caching.py \
+        --model meta-llama/Llama-2-7b-chat-hf \
+        --enable-prefix-caching \
+        --num-prompts 1 \
+        --repeat-count 100 \
+        --input-length-range 128:256
+
+ShareGPT example usage:
+    # This command samples 20 prompts with input lengths
+    # between 128 and 256 tokens from the ShareGPT dataset,
+    # then replicates each prompt 5 times.
+    python benchmark_prefix_caching.py \
+        --model meta-llama/Llama-2-7b-chat-hf \
+        --dataset-path /path/to/ShareGPT_V3_unfiltered_cleaned_split.json \
+        --enable-prefix-caching \
+        --num-prompts 20 \
+        --repeat-count 5 \
+        --input-length-range 128:256
+"""
+
+import dataclasses
+import json
+import random
+import time
+from typing import Optional
+
+from transformers import PreTrainedTokenizerBase
+
+from vllm import LLM, SamplingParams
+from vllm.engine.arg_utils import EngineArgs
+from vllm.utils import FlexibleArgumentParser
+# import triton
+
+
+try:
+    from vllm.transformers_utils.tokenizer import get_tokenizer
+except ImportError:
+    from backend_request_func import get_tokenizer
+
+# triton_version = triton.__version__
+# if triton_version.startswith("2.1"):
+#     from triton.common.backend import compute_core_version_key
+# elif triton_version.startswith("3.0"):
+#     from triton.compiler.compiler import triton_key
+# else:
+#     print(f"TRITON version {triton_version} is not specifically handled.")
+
+
+PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as fellows. You need to answer my question about the table.\n# Table\n|Opening|Opening|Sl. No.|Film|Cast|Director|Music Director|Notes|\n|----|----|----|----|----|----|----|----|\n|J A N|9|1|Agni Pushpam|Jayabharathi, Kamalahasan|Jeassy|M. K. Arjunan||\n|J A N|16|2|Priyamvada|Mohan Sharma, Lakshmi, KPAC Lalitha|K. S. Sethumadhavan|V. Dakshinamoorthy||\n|J A N|23|3|Yakshagaanam|Madhu, Sheela|Sheela|M. S. Viswanathan||\n|J A N|30|4|Paalkkadal|Sheela, Sharada|T. K. Prasad|A. T. Ummer||\n|F E B|5|5|Amma|Madhu, Srividya|M. Krishnan Nair|M. K. Arjunan||\n|F E B|13|6|Appooppan|Thikkurissi Sukumaran Nair, Kamal Haasan|P. Bhaskaran|M. S. Baburaj||\n|F E B|20|7|Srishti|Chowalloor Krishnankutty, Ravi Alummoodu|K. T. Muhammad|M. S. Baburaj||\n|F E B|20|8|Vanadevatha|Prem Nazir, Madhubala|Yusufali Kechery|G. Devarajan||\n|F E B|27|9|Samasya|Madhu, Kamalahaasan|K. Thankappan|Shyam||\n|F E B|27|10|Yudhabhoomi|K. P. Ummer, Vidhubala|Crossbelt Mani|R. K. Shekhar||\n|M A R|5|11|Seemantha Puthran|Prem Nazir, Jayabharathi|A. B. Raj|M. K. Arjunan||\n|M A R|12|12|Swapnadanam|Rani Chandra, Dr. Mohandas|K. G. George|Bhaskar Chandavarkar||\n|M A R|19|13|Thulavarsham|Prem Nazir, sreedevi, Sudheer|N. Sankaran Nair|V. Dakshinamoorthy||\n|M A R|20|14|Aruthu|Kaviyoor Ponnamma, Kamalahasan|Ravi|G. Devarajan||\n|M A R|26|15|Swimming Pool|Kamal Haasan, M. G. Soman|J. Sasikumar|M. K. Arjunan||\n\n# Question\nWhat' s the content in the (1,1) cells\n"  # noqa: E501
+
+
+def test_prefix(llm=None, sampling_params=None, prompts=None):
+    """Generate completions for all prompts and print the elapsed time.
+
+    Args:
+        llm: Object whose ``generate(prompts, sampling_params=...)`` method
+            is called exactly once with all the prompts.
+        sampling_params: Forwarded unchanged to ``llm.generate``.
+        prompts: The prompts forwarded to ``llm.generate``.
+    """
+    # perf_counter is monotonic, so the measured interval cannot be skewed by
+    # wall-clock adjustments the way a time.time() difference can.
+    start_time = time.perf_counter()
+
+    llm.generate(prompts, sampling_params=sampling_params)
+
+    elapsed = time.perf_counter() - start_time
+    print(f"cost time {elapsed}")
+
+
+@dataclasses.dataclass
+class Request:
+    """A single sampled benchmark request."""
+
+    # Prompt text; this is what is eventually handed to the engine.
+    prompt: str
+    # Number of tokens in the prompt, counted when the request was sampled.
+    prompt_len: int
+    # Output length recorded for the request (the completion's token count,
+    # or the fixed override when one is supplied by the sampler).
+    output_len: int
+
+
+def sample_tokens(tokenizer: PreTrainedTokenizerBase, length: int) -> list[int]:
+    """Draw ``length`` random non-special token ids from the vocabulary.
+
+    Args:
+        tokenizer: Tokenizer providing ``get_vocab()`` and ``all_special_ids``.
+        length: Number of token ids to draw.
+
+    Returns:
+        Token ids sampled with replacement via ``random.choices``.
+    """
+    special_ids = set(tokenizer.all_special_ids)
+
+    # get_vocab() maps token string -> token id, while all_special_ids holds
+    # ids. The filter therefore has to test the id (the dict value); testing
+    # the token string never matched, so special tokens were never removed.
+    candidate_ids = [
+        token_id
+        for token_id in tokenizer.get_vocab().values()
+        if token_id not in special_ids
+    ]
+    return random.choices(candidate_ids, k=length)
+
+
+def sample_requests_from_dataset(
+    dataset_path: str,
+    num_requests: int,
+    tokenizer: PreTrainedTokenizerBase,
+    input_length_range: tuple[int, int],
+    fixed_output_len: Optional[int],
+) -> list[Request]:
+    """Sample requests from a conversation dataset stored as JSON.
+
+    Conversations with fewer than two turns are dropped; the first turn is
+    used as the prompt and the second as the reference completion. The
+    dataset is shuffled with ``random.shuffle`` and scanned until
+    ``num_requests`` prompts fall inside ``input_length_range``.
+
+    Args:
+        dataset_path: Path of the JSON file; each entry must provide a
+            ``"conversations"`` list whose items carry a ``"value"`` string.
+        num_requests: Maximum number of requests to return.
+        tokenizer: Tokenizer used to count tokens and to decode the prompt.
+        input_length_range: Inclusive ``(min_len, max_len)`` bounds on the
+            prompt token count.
+        fixed_output_len: When not ``None``, used as every request's output
+            length instead of the completion's token count.
+
+    Returns:
+        At most ``num_requests`` requests; fewer when the dataset does not
+        hold enough prompts in range.
+
+    Raises:
+        ValueError: If ``fixed_output_len`` is below 4 or the length range
+            is negative or inverted.
+    """
+    if fixed_output_len is not None and fixed_output_len < 4:
+        raise ValueError("output_len too small")
+
+    # Load the dataset.
+    with open(dataset_path) as f:
+        dataset = json.load(f)
+    # Keep only the first two turns of conversations that have at least two.
+    dataset = [
+        (data["conversations"][0]["value"], data["conversations"][1]["value"])
+        for data in dataset
+        if len(data["conversations"]) >= 2
+    ]
+
+    # Shuffle the dataset.
+    random.shuffle(dataset)
+
+    min_len, max_len = input_length_range
+    # Raise instead of assert so the check also runs under ``python -O``.
+    if min_len < 0 or max_len < min_len:
+        raise ValueError(
+            f"invalid input_length_range {min_len}:{max_len}; "
+            "expected 0 <= min <= max"
+        )
+
+    filtered_requests: list[Request] = []
+    for prompt_text, completion in dataset:
+        if len(filtered_requests) == num_requests:
+            break
+
+        prompt_token_ids = tokenizer(prompt_text).input_ids
+        prompt_len = len(prompt_token_ids)
+        # Reject out-of-range prompts before doing any further tokenizer work.
+        if not min_len <= prompt_len <= max_len:
+            continue
+
+        prompt = tokenizer.decode(prompt_token_ids)
+        if fixed_output_len is None:
+            output_len = len(tokenizer(completion).input_ids)
+        else:
+            # The override makes tokenizing the completion unnecessary.
+            output_len = fixed_output_len
+        filtered_requests.append(Request(prompt, prompt_len, output_len))
+
+    return filtered_requests
+
+
+def sample_requests_from_random(
+    num_requests: int,
+    tokenizer: PreTrainedTokenizerBase,
+    input_length_range: tuple[int, int],
+    fixed_output_len: Optional[int],
+    prefix_len: int,
+) -> list[Request]:
+    """Create requests made of random tokens behind one shared random prefix.
+
+    Args:
+        num_requests: Number of requests to create.
+        tokenizer: Tokenizer used to sample token ids and decode the prompt.
+        input_length_range: ``(min_len, max_len)`` bounds on the total
+            prompt length, shared prefix included.
+        fixed_output_len: Stored as each request's output length.
+        prefix_len: Number of tokens in the prefix shared by all requests.
+
+    Returns:
+        The generated requests, in creation order.
+    """
+    shared_prefix = sample_tokens(tokenizer, prefix_len)
+    min_len, max_len = input_length_range
+    # The per-request part fills whatever the shared prefix leaves of the range.
+    shortest_tail = min_len - prefix_len
+    longest_tail = max_len - prefix_len
+
+    sampled: list[Request] = []
+    for _ in range(num_requests):
+        tail_len = random.randint(shortest_tail, longest_tail)
+        token_ids = shared_prefix + sample_tokens(tokenizer, tail_len)
+        text = tokenizer.decode(token_ids)
+        total_len = len(token_ids)
+        assert min_len <= total_len <= max_len, (
+            f"prompt_len {total_len} out of range {min_len}:{max_len}"
+        )
+        sampled.append(Request(text, total_len, fixed_output_len))
+    return sampled
+
+
+def repeat_and_sort_requests(
+    requests: list[Request], repeat_count: int, sort: bool = False
+) -> list[str]:
+    """Repeat the requests and return their prompts, sorted or shuffled.
+
+    Args:
+        requests: Requests to repeat; the list itself is not modified.
+        repeat_count: Number of times the whole list is repeated.
+        sort: When true, order by ascending prompt length; otherwise
+            shuffle with ``random.shuffle``.
+
+    Returns:
+        The prompt strings of the repeated requests in the chosen order.
+    """
+    repeated_requests = requests * repeat_count
+    if sort:
+        # Request is a dataclass, not a tuple: indexing it with [1] raised
+        # TypeError, so the sort key has to read the attribute.
+        repeated_requests.sort(key=lambda req: req.prompt_len)
+    else:
+        random.shuffle(repeated_requests)
+    return [req.prompt for req in repeated_requests]
+
+
+def main(args):
+    """Sample prompts, report their length statistics, and time generation.
+
+    Args:
+        args: Parsed command-line namespace; the benchmark options and the
+            engine options consumed by ``EngineArgs.from_cli_args`` are read
+            from it.
+
+    Raises:
+        ValueError: If ``--prefix-len`` is combined with ``--dataset-path``,
+            or if no request could be sampled.
+    """
+    # NOTE(review): trust_remote_code is hard-coded to True here rather than
+    # taken from the parsed engine arguments - confirm this is intended.
+    tokenizer = get_tokenizer(args.model, trust_remote_code=True)
+    input_length_range = tuple(map(int, args.input_length_range.split(":")))
+    random.seed(args.seed)
+    if args.dataset_path is not None:
+        if args.prefix_len > 0:
+            raise ValueError(
+                "prefix-len is not supported when dataset-path is provided."
+            )
+        print(f"Start to sample {args.num_prompts} prompts from {args.dataset_path}")
+        filtered_requests = sample_requests_from_dataset(
+            dataset_path=args.dataset_path,
+            num_requests=args.num_prompts,
+            tokenizer=tokenizer,
+            input_length_range=input_length_range,
+            fixed_output_len=args.output_len,
+        )
+    else:
+        print(f"Start to sample {args.num_prompts} prompts from random")
+        filtered_requests = sample_requests_from_random(
+            num_requests=args.num_prompts,
+            tokenizer=tokenizer,
+            input_length_range=input_length_range,
+            fixed_output_len=args.output_len,
+            prefix_len=args.prefix_len,
+        )
+
+    # Print some helpful stats of the requests.
+    print(f"Sampled {len(filtered_requests)} requests.")
+    # The statistics below divide by the request count and take min/max, so
+    # an empty sample must be reported explicitly rather than crash there.
+    if not filtered_requests:
+        raise ValueError(
+            "No requests were sampled; check --num-prompts, "
+            "--input-length-range and the dataset contents."
+        )
+    prompt_lens = [req.prompt_len for req in filtered_requests]
+    print(f"Average input length: {sum(prompt_lens) / len(prompt_lens)}")
+    print(f"P50 input length: {sorted(prompt_lens)[len(prompt_lens) // 2]}")
+    print(f"Min Prompt Length: {min(prompt_lens)}")
+    print(f"Max Prompt Length: {max(prompt_lens)}")
+
+    engine_args = EngineArgs.from_cli_args(args)
+
+    llm = LLM(**dataclasses.asdict(engine_args))
+
+    sampling_params = SamplingParams(
+        temperature=0,
+        max_tokens=args.output_len,
+        detokenize=not args.disable_detokenize,
+    )
+
+    print("Testing filtered requests")
+    prompts = repeat_and_sort_requests(
+        filtered_requests, repeat_count=args.repeat_count, sort=args.sort
+    )
+
+    print("------start generating------")
+    test_prefix(
+        llm=llm,
+        prompts=prompts,
+        sampling_params=sampling_params,
+    )
+
+
+def create_argument_parser():
+    """Build the command-line parser for the prefix-caching benchmark.
+
+    Registers the benchmark-specific options and then appends the engine
+    options through ``EngineArgs.add_cli_args``.
+
+    Returns:
+        The parser returned by ``EngineArgs.add_cli_args``.
+    """
+    parser = FlexibleArgumentParser(
+        description="Benchmark the performance with or without "
+        "automatic prefix caching."
+    )
+    parser.add_argument(
+        "--dataset-path", type=str, default=None, help="Path to the dataset."
+    )
+    parser.add_argument("--output-len", type=int, default=10)
+    parser.add_argument(
+        "--num-prompts",
+        type=int,
+        required=True,
+        help="Number of the prompts sampled from dataset",
+    )
+    parser.add_argument(
+        "--repeat-count",
+        type=int,
+        default=1,
+        help="Number of times to repeat each prompt",
+    )
+    parser.add_argument(
+        "--sort", action="store_true", help="Sort prompts by input length"
+    )
+    parser.add_argument(
+        "--input-length-range",
+        type=str,
+        required=True,
+        # The first fragment needs its trailing space; without it the help
+        # rendered as "prompts,specified".
+        help="Range of input lengths for sampling prompts, "
+        'specified as "min:max" (e.g., "128:256").',
+    )
+    parser.add_argument(
+        "--prefix-len",
+        type=int,
+        default=0,
+        help="Specifies the length of a common prefix to be "
+        "added to the input prompt. The input-length-range will "
+        "subtract this length when filtering prompts. Only used "
+        "when dataset-path is not provided.",
+    )
+    parser.add_argument(
+        "--disable-detokenize",
+        action="store_true",
+        help=(
+            "Do not detokenize responses (i.e. do not include "
+            "detokenization time in the latency measurement)"
+        ),
+    )
+
+    parser = EngineArgs.add_cli_args(parser)
+
+    return parser
+
+
+if __name__ == "__main__":
+    # Script entry point: parse the command line and run the benchmark.
+    parser = create_argument_parser()
+    args = parser.parse_args()
+    main(args)
--- a/benchmark_prioritization.py
+++ b/benchmark_prioritization.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Benchmark offline prioritization."""
+
+import argparse
+import dataclasses
+import json
+import random
+import time
+from typing import Optional
+
+from transformers import AutoTokenizer, PreTrainedTokenizerBase
+
+from vllm.engine.arg_utils import EngineArgs
+from vllm.utils import FlexibleArgumentParser
+
+
+def get_random_flag():
+    """Return 0 or 1 as a random priority, each value equally likely."""
+    # A draw below 0.5 maps to 0; anything else maps to 1.
+    return int(random.random() >= 0.5)
+
+
+def sample_requests(
+    dataset_path: str,
+    num_requests: int,
+    tokenizer: PreTrainedTokenizerBase,
+    fixed_output_len: Optional[int],
+) -> list[tuple[str, int, int, int]]:
+    """Sample prioritized requests from a conversation dataset stored as JSON.
+
+    Conversations with fewer than two turns are dropped; the first turn is
+    used as the prompt and the second as the reference completion. The
+    dataset is shuffled with ``random.shuffle`` and scanned until
+    ``num_requests`` entries pass the length filters.
+
+    Args:
+        dataset_path: Path of the JSON file; each entry must provide a
+            ``"conversations"`` list whose items carry a ``"value"`` string.
+        num_requests: Maximum number of requests to return.
+        tokenizer: Tokenizer used to count prompt and completion tokens.
+        fixed_output_len: When not ``None``, used as every request's output
+            length instead of the completion's token count.
+
+    Returns:
+        ``(prompt, prompt_len, output_len, priority)`` tuples, where
+        ``priority`` comes from ``get_random_flag()``.
+
+    Raises:
+        ValueError: If ``fixed_output_len`` is below 4.
+    """
+    if fixed_output_len is not None and fixed_output_len < 4:
+        raise ValueError("output_len too small")
+
+    # Load the dataset.
+    with open(dataset_path) as f:
+        dataset = json.load(f)
+    # Keep only the first two turns of conversations that have at least two.
+    dataset = [
+        (data["conversations"][0]["value"], data["conversations"][1]["value"])
+        for data in dataset
+        if len(data["conversations"]) >= 2
+    ]
+
+    # Shuffle the dataset.
+    random.shuffle(dataset)
+
+    # Filter out sequences that are too long or too short. Each entry is a
+    # 4-tuple; the annotation previously claimed three fields.
+    filtered_dataset: list[tuple[str, int, int, int]] = []
+    for prompt, completion in dataset:
+        if len(filtered_dataset) == num_requests:
+            break
+
+        prompt_len = len(tokenizer(prompt).input_ids)
+        if fixed_output_len is None:
+            output_len = len(tokenizer(completion).input_ids)
+        else:
+            # The override makes tokenizing the completion unnecessary.
+            output_len = fixed_output_len
+
+        if prompt_len < 4 or output_len < 4:
+            # Prune too short sequences.
+            continue
+        if prompt_len > 1024 or prompt_len + output_len > 2048:
+            # Prune too long sequences.
+            continue
+
+        # Draw the priority only for accepted requests so the random stream
+        # is consumed exactly as before.
+        priority = get_random_flag()
+        filtered_dataset.append((prompt, prompt_len, output_len, priority))
+
+    return filtered_dataset
+
+
+def run_vllm(
+    requests: list[tuple[str, int, int, int]],
+    n: int,
+    engine_args: EngineArgs,
+    disable_detokenize: bool = False,
+) -> float:
+    """Run every request through a vLLM engine and time the generation.
+
+    Args:
+        requests: ``(prompt, prompt_len, output_len, priority)`` tuples. The
+            annotation previously declared three fields although four are
+            unpacked below.
+        n: Number of sequences to generate per prompt.
+        engine_args: Engine configuration, expanded into ``LLM(...)``.
+        disable_detokenize: When true, sampling is configured with
+            ``detokenize=False``.
+
+    Returns:
+        Seconds spent inside ``llm.generate``; engine construction is not
+        included in the measurement.
+    """
+    from vllm import LLM, SamplingParams
+
+    llm = LLM(**dataclasses.asdict(engine_args))
+
+    assert all(
+        llm.llm_engine.model_config.max_model_len >= (request[1] + request[2])
+        for request in requests
+    ), (
+        "Please ensure that max_model_len is greater than the sum of"
+        " input_len and output_len for all requests."
+    )
+
+    # Split the requests into the parallel lists expected by llm.generate.
+    prompts = []
+    sampling_params = []
+    priority = []
+    for prompt, _, output_len, _priority in requests:
+        prompts.append(prompt)
+        priority.append(_priority)
+        sampling_params.append(
+            SamplingParams(
+                n=n,
+                temperature=1.0,
+                top_p=1.0,
+                ignore_eos=True,
+                max_tokens=output_len,
+                detokenize=not disable_detokenize,
+            )
+        )
+
+    start = time.perf_counter()
+    llm.generate(prompts, sampling_params, priority=priority, use_tqdm=True)
+    end = time.perf_counter()
+    return end - start
+
+
+def main(args: argparse.Namespace):
+    """Build the request set, run it through the backend, report throughput.
+
+    Args:
+        args: Parsed command-line namespace.
+
+    Raises:
+        ValueError: If ``args.backend`` is anything other than ``"vllm"``.
+    """
+    print(args)
+    random.seed(args.seed)
+
+    # Sample the requests.
+    tokenizer = AutoTokenizer.from_pretrained(
+        args.tokenizer, trust_remote_code=args.trust_remote_code
+    )
+    if args.dataset is not None:
+        requests = sample_requests(
+            args.dataset, args.num_prompts, tokenizer, args.output_len
+        )
+    else:
+        # Synthesize a prompt with the given input length.
+        synthetic_prompt = "hi" * (args.input_len - 1)
+        requests = [
+            (synthetic_prompt, args.input_len, args.output_len, get_random_flag())
+            for _ in range(args.num_prompts)
+        ]
+
+    if args.backend != "vllm":
+        raise ValueError(f"Unknown backend: {args.backend}")
+    elapsed_time = run_vllm(
+        requests, args.n, EngineArgs.from_cli_args(args), args.disable_detokenize
+    )
+
+    num_requests = len(requests)
+    total_num_tokens = sum(
+        prompt_len + output_len for _, prompt_len, output_len, _ in requests
+    )
+    requests_per_second = num_requests / elapsed_time
+    tokens_per_second = total_num_tokens / elapsed_time
+    print(
+        f"Throughput: {requests_per_second:.2f} requests/s, "
+        f"{tokens_per_second:.2f} tokens/s"
+    )
+
+    # Output JSON results if specified
+    if args.output_json:
+        results = {
+            "elapsed_time": elapsed_time,
+            "num_requests": num_requests,
+            "total_num_tokens": total_num_tokens,
+            "requests_per_second": requests_per_second,
+            "tokens_per_second": tokens_per_second,
+        }
+        with open(args.output_json, "w") as f:
+            json.dump(results, f, indent=4)
+
+
+def create_argument_parser():
+    """Build the command-line parser for the prioritization benchmark.
+
+    Benchmark-specific options are registered first, in a fixed order, and
+    the engine options are then appended via ``EngineArgs.add_cli_args``.
+
+    Returns:
+        The parser returned by ``EngineArgs.add_cli_args``.
+    """
+    # (flag, add_argument keyword arguments), listed in registration order.
+    benchmark_options = [
+        (
+            "--backend",
+            {"type": str, "choices": ["vllm", "hf", "mii"], "default": "vllm"},
+        ),
+        (
+            "--dataset",
+            {"type": str, "default": None, "help": "Path to the dataset."},
+        ),
+        (
+            "--input-len",
+            {
+                "type": int,
+                "default": None,
+                "help": "Input prompt length for each request",
+            },
+        ),
+        (
+            "--output-len",
+            {
+                "type": int,
+                "default": None,
+                "help": "Output length for each request. Overrides the "
+                "output length from the dataset.",
+            },
+        ),
+        (
+            "--n",
+            {
+                "type": int,
+                "default": 1,
+                "help": "Number of generated sequences per prompt.",
+            },
+        ),
+        (
+            "--num-prompts",
+            {
+                "type": int,
+                "default": 200,
+                "help": "Number of prompts to process.",
+            },
+        ),
+        (
+            "--output-json",
+            {
+                "type": str,
+                "default": None,
+                "help": "Path to save the throughput results in JSON format.",
+            },
+        ),
+        (
+            "--disable-detokenize",
+            {
+                "action": "store_true",
+                "help": (
+                    "Do not detokenize responses (i.e. do not include "
+                    "detokenization time in the latency measurement)"
+                ),
+            },
+        ),
+    ]
+
+    parser = FlexibleArgumentParser(description="Benchmark the throughput.")
+    for flag, options in benchmark_options:
+        parser.add_argument(flag, **options)
+
+    parser = EngineArgs.add_cli_args(parser)
+
+    return parser
+
+
+if __name__ == "__main__":
+    parser = create_argument_parser()
+    args = parser.parse_args()
+    if args.tokenizer is None:
+        args.tokenizer = args.model
+    # Validate through parser.error rather than assert: asserts are stripped
+    # under ``python -O`` and give the user no explanation when they fire.
+    if args.dataset is None:
+        if args.input_len is None or args.output_len is None:
+            parser.error(
+                "--input-len and --output-len are required when "
+                "--dataset is not given"
+            )
+    elif args.input_len is not None:
+        parser.error("--input-len cannot be combined with --dataset")
+
+    main(args)
\ No newline at end of file
--- a/benchmark_serving.py
+++ b/benchmark_serving.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+r"""Benchmark online serving throughput.
+
+On the server side, run one of the following commands:
+    vLLM OpenAI API server
+    vllm serve <your_model> \
+        --swap-space 16 \
+        --disable-log-requests
+
+On the client side, run:
+    python benchmarks/benchmark_serving.py \
+        --backend <backend> \
+        --model <your_model> \
+        --dataset-name sharegpt \
+        --dataset-path <path to dataset> \
+        --request-rate <request_rate> \ # By default <request_rate> is inf
+        --num-prompts <num_prompts> # By default <num_prompts> is 1000
+
+    when using tgi backend, add
+        --endpoint /generate_stream
+    to the end of the command above.
+"""
+
+import argparse
+import asyncio
+import gc
+import json
+import os
+import random
+import time
+import warnings
+from collections.abc import AsyncGenerator, Iterable
+from dataclasses import dataclass
+from datetime import datetime
+from typing import Any, Literal, Optional
+
+import numpy as np
+from tqdm.asyncio import tqdm
+from transformers import PreTrainedTokenizerBase
+
+from backend_request_func import (
+    ASYNC_REQUEST_FUNCS,
+    OPENAI_COMPATIBLE_BACKENDS,
+    RequestFuncInput,
+    RequestFuncOutput,
+)
+
+try:
+    from vllm.transformers_utils.tokenizer import get_tokenizer
+except ImportError:
+    from backend_request_func import get_tokenizer
+
+try:
+    from vllm.utils import FlexibleArgumentParser
+except ImportError:
+    from argparse import ArgumentParser as FlexibleArgumentParser
+
+from benchmark_dataset import (
+    AIMODataset,
+    ASRDataset,
+    BurstGPTDataset,
+    ConversationDataset,
+    CustomDataset,
+    HuggingFaceDataset,
+    InstructCoderDataset,
+    MTBenchDataset,
+    NextEditPredictionDataset,
+    RandomDataset,
+    SampleRequest,
+    ShareGPTDataset,
+    SonnetDataset,
+    VisionArenaDataset,
+)
+from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
+
+MILLISECONDS_TO_SECONDS_CONVERSION = 1000
+
+
+@dataclass
+class BenchmarkMetrics:
+    """Container for the summary statistics of one benchmark run.
+
+    NOTE(review): the code that fills these fields is outside this excerpt;
+    the groupings below are read off the field names only - confirm units
+    and exact definitions where the metrics are computed.
+    """
+
+    # Request and token counts.
+    completed: int
+    total_input: int
+    total_output: int
+    # Throughput figures.
+    request_throughput: float
+    request_goodput: float
+    output_throughput: float
+    total_token_throughput: float
+    # TTFT statistics (presumably time to first token, in ms - TODO confirm).
+    mean_ttft_ms: float
+    median_ttft_ms: float
+    std_ttft_ms: float
+    percentiles_ttft_ms: list[tuple[float, float]]
+    # TPOT statistics (presumably time per output token, in ms - TODO confirm).
+    mean_tpot_ms: float
+    median_tpot_ms: float
+    std_tpot_ms: float
+    percentiles_tpot_ms: list[tuple[float, float]]
+    # ITL statistics (presumably inter-token latency, in ms - TODO confirm).
+    mean_itl_ms: float
+    median_itl_ms: float
+    std_itl_ms: float
+    percentiles_itl_ms: list[tuple[float, float]]
+    # E2EL stands for end-to-end latency per request.
+    # It is the time taken on the client side from sending
+    # a request to receiving a complete response.
+    mean_e2el_ms: float
+    median_e2el_ms: float
+    std_e2el_ms: float
+    percentiles_e2el_ms: list[tuple[float, float]]
+
+
+def _get_current_request_rate(
+    ramp_up_strategy: Optional[Literal["linear", "exponential"]],
+    ramp_up_start_rps: Optional[int],
+    ramp_up_end_rps: Optional[int],
+    request_index: int,
+    total_requests: int,
+    request_rate: float,
+) -> float:
+    """Return the request rate that applies to the request at ``request_index``.
+
+    When a ramp-up strategy and both ramp bounds are supplied, the rate is
+    interpolated between the bounds according to how far ``request_index``
+    is through ``total_requests``; otherwise ``request_rate`` is returned
+    unchanged.
+
+    Raises:
+        ValueError: If ramp-up is configured with an unknown strategy.
+    """
+    ramp_configured = (
+        bool(ramp_up_strategy)
+        and ramp_up_start_rps is not None
+        and ramp_up_end_rps is not None
+    )
+    if not ramp_configured:
+        return request_rate
+
+    # Fraction of the run completed: 0 for the first request, 1 for the last.
+    # The max() keeps the divisor at 1 when there is a single request.
+    progress = request_index / max(total_requests - 1, 1)
+
+    if ramp_up_strategy == "linear":
+        return ramp_up_start_rps + (ramp_up_end_rps - ramp_up_start_rps) * progress
+    if ramp_up_strategy == "exponential":
+        ratio = ramp_up_end_rps / ramp_up_start_rps
+        return ramp_up_start_rps * (ratio**progress)
+    raise ValueError(f"Unknown ramp-up strategy: {ramp_up_strategy}")
+
+
+async def get_request(
+    input_requests: list[SampleRequest],
+    request_rate: float,
+    burstiness: float = 1.0,
+    ramp_up_strategy: Optional[Literal["linear", "exponential"]] = None,
+    ramp_up_start_rps: Optional[int] = None,
+    ramp_up_end_rps: Optional[int] = None,
+) -> AsyncGenerator[tuple[SampleRequest, float], None]:
+    """
+    Asynchronously generates requests at a specified rate
+    with OPTIONAL burstiness and OPTIONAL ramp-up strategy.
+
+    Args:
+        input_requests:
+            A list of input requests, each represented as a SampleRequest.
+        request_rate:
+            The rate at which requests are generated (requests/s).
+        burstiness (optional):
+            The burstiness factor of the request generation.
+            Only takes effect when request_rate is not inf.
+            Default value is 1, which follows a Poisson process.
+            Otherwise, the request intervals follow a gamma distribution.
+            A lower burstiness value (0 < burstiness < 1) results
+            in more bursty requests, while a higher burstiness value
+            (burstiness > 1) results in a more uniform arrival of requests.
+         ramp_up_strategy (optional):
+            The ramp-up strategy. Can be "linear" or "exponential".
+            If None, uses constant request rate (specified by request_rate).
+        ramp_up_start_rps (optional):
+            The starting request rate for ramp-up.
+        ramp_up_end_rps (optional):
+            The ending request rate for ramp-up.
+    """
+    assert burstiness > 0, (
+        f"A positive burstiness factor is expected, but given {burstiness}."
+    )
+    # Convert to list to get length for ramp-up calculations
+    if isinstance(input_requests, Iterable) and not isinstance(input_requests, list):
+        input_requests = list(input_requests)
+
+    total_requests = len(input_requests)
+    request_index = 0
+
+    for request in input_requests:
+        current_request_rate = _get_current_request_rate(
+            ramp_up_strategy,
+            ramp_up_start_rps,
+            ramp_up_end_rps,
+            request_index,
+            total_requests,
+            request_rate,
+        )
+
+        yield request, current_request_rate
+
+        request_index += 1
+
+        if current_request_rate == float("inf"):
+            # If the request rate is infinity, then we don't need to wait.
+            continue
+
+        theta = 1.0 / (current_request_rate * burstiness)
+
+        # Sample the request interval from the gamma distribution.
+        # If burstiness is 1, it follows exponential distribution.
+        interval = np.random.gamma(shape=burstiness, scale=theta)
+        # The next request will be sent after the interval.
+        await asyncio.sleep(interval)
+
+
+def calculate_metrics(
+    input_requests: list[SampleRequest],
+    outputs: list[RequestFuncOutput],
+    dur_s: float,
+    tokenizer: PreTrainedTokenizerBase,
+    selected_percentile_metrics: list[str],
+    selected_percentiles: list[float],
+    goodput_config_dict: dict[str, float],
+) -> tuple[BenchmarkMetrics, list[int]]:
+    actual_output_lens: list[int] = []
+    total_input = 0
+    completed = 0
+    good_completed = 0
+    itls: list[float] = []
+    tpots: list[float] = []
+    all_tpots: list[float] = []
+    ttfts: list[float] = []
+    e2els: list[float] = []
+    for i in range(len(outputs)):
+        if outputs[i].success:
+            output_len = outputs[i].output_tokens
+
+            if not output_len:
+                # We use the tokenizer to count the number of output tokens
+                # for some serving backends instead of looking at
+                # len(outputs[i].itl) since multiple output tokens may be
+                # bundled together
+                # Note : this may inflate the output token count slightly
+                output_len = len(
+                    tokenizer(
+                        outputs[i].generated_text, add_special_tokens=False
+                    ).input_ids
+                )
+            actual_output_lens.append(output_len)
+            total_input += input_requests[i].prompt_len
+            tpot = 0
+            if output_len > 1:
+                latency_minus_ttft = outputs[i].latency - outputs[i].ttft
+                tpot = latency_minus_ttft / (output_len - 1)
+                tpots.append(tpot)
+            # Note: if output_len <= 1, we regard tpot as 0 for goodput
+            all_tpots.append(tpot)
+            itls += outputs[i].itl
+            ttfts.append(outputs[i].ttft)
+            e2els.append(outputs[i].latency)
+            completed += 1
+        else:
+            actual_output_lens.append(0)
+
+    if goodput_config_dict:
+        valid_metrics = []
+        slo_values = []
+
+        if "ttft" in goodput_config_dict:
+            valid_metrics.append(ttfts)
+            slo_values.append(
+                goodput_config_dict["ttft"] / MILLISECONDS_TO_SECONDS_CONVERSION
+            )
+        if "tpot" in goodput_config_dict:
+            valid_metrics.append(all_tpots)
+            slo_values.append(
+                goodput_config_dict["tpot"] / MILLISECONDS_TO_SECONDS_CONVERSION
+            )
+        if "e2el" in goodput_config_dict:
+            valid_metrics.append(e2els)
+            slo_values.append(
+                goodput_config_dict["e2el"] / MILLISECONDS_TO_SECONDS_CONVERSION
+            )
+
+        for req_metric in zip(*valid_metrics):
+            is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)])
+            if is_good_req:
+                good_completed += 1
+
+    if completed == 0:
+        warnings.warn(
+            "All requests failed. This is likely due to a misconfiguration "
+            "on the benchmark arguments.",
+            stacklevel=2,
+        )
+    metrics = BenchmarkMetrics(
+        completed=completed,
+        total_input=total_input,
+        total_output=sum(actual_output_lens),
+        request_throughput=completed / dur_s,
+        request_goodput=good_completed / dur_s,
+        output_throughput=sum(actual_output_lens) / dur_s,
+        total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s,
+        mean_ttft_ms=np.mean(ttfts or 0)
+        * 1000,  # ttfts is empty if streaming is not supported by backend
+        std_ttft_ms=np.std(ttfts or 0) * 1000,
+        median_ttft_ms=np.median(ttfts or 0) * 1000,
+        percentiles_ttft_ms=[
+            (p, np.percentile(ttfts or 0, p) * 1000) for p in selected_percentiles
+        ],
+        mean_tpot_ms=np.mean(tpots or 0) * 1000,
+        std_tpot_ms=np.std(tpots or 0) * 1000,
+        median_tpot_ms=np.median(tpots or 0) * 1000,
+        percentiles_tpot_ms=[
+            (p, np.percentile(tpots or 0, p) * 1000) for p in selected_percentiles
+        ],
+        mean_itl_ms=np.mean(itls or 0) * 1000,
+        std_itl_ms=np.std(itls or 0) * 1000,
+        median_itl_ms=np.median(itls or 0) * 1000,
+        percentiles_itl_ms=[
+            (p, np.percentile(itls or 0, p) * 1000) for p in selected_percentiles
+        ],
+        mean_e2el_ms=np.mean(e2els or 0) * 1000,
+        std_e2el_ms=np.std(e2els or 0) * 1000,
+        median_e2el_ms=np.median(e2els or 0) * 1000,
+        percentiles_e2el_ms=[
+            (p, np.percentile(e2els or 0, p) * 1000) for p in selected_percentiles
+        ],
+    )
+
+    return metrics, actual_output_lens
+
+
+async def benchmark(
+    backend: str,
+    api_url: str,
+    base_url: str,
+    model_id: str,
+    model_name: str,
+    tokenizer: PreTrainedTokenizerBase,
+    input_requests: list[SampleRequest],
+    logprobs: Optional[int],
+    request_rate: float,
+    burstiness: float,
+    disable_tqdm: bool,
+    profile: bool,
+    selected_percentile_metrics: list[str],
+    selected_percentiles: list[float],
+    ignore_eos: bool,
+    goodput_config_dict: dict[str, float],
+    max_concurrency: Optional[int],
+    lora_modules: Optional[Iterable[str]],
+    extra_body: Optional[dict],
+    ramp_up_strategy: Optional[Literal["linear", "exponential"]] = None,
+    ramp_up_start_rps: Optional[int] = None,
+    ramp_up_end_rps: Optional[int] = None,
+):
+    """Run the serving benchmark and return a dict of results.
+
+    The coroutine first sends ``input_requests[0]`` as a single test request,
+    optionally starts the profiler via ``base_url + "/start_profile"``, then
+    issues every request paced by ``get_request`` (optionally bounded by
+    ``max_concurrency`` concurrent requests), waits for all of them, computes
+    metrics with ``calculate_metrics`` and prints a summary.
+
+    Returns:
+        A dict with the duration, throughput figures, per-request raw data
+        and, for each metric in ``selected_percentile_metrics``, its
+        mean/median/std and the requested percentiles. When ramp-up is used
+        it also contains ``"rps_change_events"``.
+
+    Raises:
+        ValueError: If ``backend`` is not in ``ASYNC_REQUEST_FUNCS`` or the
+            initial test request fails.
+    """
+    if backend in ASYNC_REQUEST_FUNCS:
+        request_func = ASYNC_REQUEST_FUNCS[backend]
+    else:
+        raise ValueError(f"Unknown backend: {backend}")
+
+    print("Starting initial single prompt test run...")
+    # The first sampled request doubles as the test request and as the
+    # payload of the profiler start/stop calls below.
+    test_prompt, test_prompt_len, test_output_len, test_mm_content = (
+        input_requests[0].prompt,
+        input_requests[0].prompt_len,
+        input_requests[0].expected_output_len,
+        input_requests[0].multi_modal_data,
+    )
+
+    assert test_mm_content is None or isinstance(test_mm_content, dict)
+    test_input = RequestFuncInput(
+        model=model_id,
+        model_name=model_name,
+        prompt=test_prompt,
+        api_url=api_url,
+        prompt_len=test_prompt_len,
+        output_len=test_output_len,
+        logprobs=logprobs,
+        multi_modal_content=test_mm_content,
+        ignore_eos=ignore_eos,
+        extra_body=extra_body,
+    )
+
+    test_output = await request_func(request_func_input=test_input)
+    if not test_output.success:
+        raise ValueError(
+            "Initial test run failed - Please make sure benchmark arguments "
+            f"are correctly specified. Error: {test_output.error}"
+        )
+    else:
+        print("Initial test run completed. Starting main benchmark run...")
+
+    if lora_modules:
+        # For each input request, choose a LoRA module at random.
+        # The choices are wrapped in an iterator that is consumed with
+        # next() once per request in the dispatch loop.
+        lora_modules = iter(
+            [random.choice(lora_modules) for _ in range(len(input_requests))]
+        )
+
+    if profile:
+        print("Starting profiler...")
+        profile_input = RequestFuncInput(
+            model=model_id,
+            model_name=model_name,
+            prompt=test_prompt,
+            api_url=base_url + "/start_profile",
+            prompt_len=test_prompt_len,
+            output_len=test_output_len,
+            logprobs=logprobs,
+            multi_modal_content=test_mm_content,
+            ignore_eos=ignore_eos,
+            extra_body=extra_body,
+        )
+        profile_output = await request_func(request_func_input=profile_input)
+        if profile_output.success:
+            print("Profiler started")
+
+    distribution = "Poisson process" if burstiness == 1.0 else "Gamma distribution"
+
+    if ramp_up_strategy is not None:
+        print(
+            f"Traffic ramp-up strategy: {ramp_up_strategy}. Will increase "
+            f"RPS from {ramp_up_start_rps} to {ramp_up_end_rps} RPS over "
+            "the duration of the benchmark."
+        )
+    else:
+        print(f"Traffic request rate: {request_rate} RPS.")
+
+    print(f"Burstiness factor: {burstiness} ({distribution})")
+    print(f"Maximum request concurrency: {max_concurrency}")
+
+    pbar = None if disable_tqdm else tqdm(total=len(input_requests))
+
+    # This can be used once the minimum Python version is 3.10 or higher,
+    # and it will simplify the code in limited_request_func.
+    #    semaphore = (asyncio.Semaphore(max_concurrency)
+    #                 if max_concurrency else contextlib.nullcontext())
+    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
+
+    # Sends one request, holding the semaphore (if any) for its whole
+    # duration so at most max_concurrency requests are in flight.
+    async def limited_request_func(request_func_input, pbar):
+        if semaphore is None:
+            return await request_func(request_func_input=request_func_input, pbar=pbar)
+        async with semaphore:
+            return await request_func(request_func_input=request_func_input, pbar=pbar)
+
+    benchmark_start_time = time.perf_counter()
+    tasks: list[asyncio.Task] = []
+
+    # Timestamped record of every integer RPS level reached during ramp-up.
+    rps_change_events = []
+    last_int_rps = -1
+    if ramp_up_strategy is not None and ramp_up_start_rps is not None:
+        last_int_rps = ramp_up_start_rps
+        rps_change_events.append(
+            {
+                "rps": last_int_rps,
+                "timestamp": datetime.now().isoformat(),
+            }
+        )
+
+    async for request, current_request_rate in get_request(
+        input_requests,
+        request_rate,
+        burstiness,
+        ramp_up_strategy,
+        ramp_up_start_rps,
+        ramp_up_end_rps,
+    ):
+        if ramp_up_strategy is not None:
+            current_int_rps = int(current_request_rate)
+            if current_int_rps > last_int_rps:
+                # Emit one event per integer level crossed since the last
+                # request; all of them share the same timestamp.
+                timestamp = datetime.now().isoformat()
+                for rps_val in range(last_int_rps + 1, current_int_rps + 1):
+                    rps_change_events.append({"rps": rps_val, "timestamp": timestamp})
+                last_int_rps = current_int_rps
+
+        prompt, prompt_len, output_len, mm_content = (
+            request.prompt,
+            request.prompt_len,
+            request.expected_output_len,
+            request.multi_modal_data,
+        )
+        req_model_id, req_model_name = model_id, model_name
+        if lora_modules:
+            # With LoRA, the chosen module replaces both model id and name.
+            req_lora_module = next(lora_modules)
+            req_model_id, req_model_name = req_lora_module, req_lora_module
+
+        request_func_input = RequestFuncInput(
+            model=req_model_id,
+            model_name=req_model_name,
+            prompt=prompt,
+            api_url=api_url,
+            prompt_len=prompt_len,
+            output_len=output_len,
+            logprobs=logprobs,
+            multi_modal_content=mm_content,
+            ignore_eos=ignore_eos,
+            extra_body=extra_body,
+        )
+        # Schedule the request as a task so it runs while the generator
+        # waits before yielding the next one.
+        task = limited_request_func(request_func_input=request_func_input, pbar=pbar)
+        tasks.append(asyncio.create_task(task))
+    outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)
+
+    if profile:
+        print("Stopping profiler...")
+        # NOTE(review): unlike the start request, this one omits model_name,
+        # multi_modal_content, ignore_eos and extra_body - confirm intended.
+        profile_input = RequestFuncInput(
+            model=model_id,
+            prompt=test_prompt,
+            api_url=base_url + "/stop_profile",
+            prompt_len=test_prompt_len,
+            output_len=test_output_len,
+            logprobs=logprobs,
+        )
+        profile_output = await request_func(request_func_input=profile_input)
+        if profile_output.success:
+            print("Profiler stopped")
+
+    if pbar is not None:
+        pbar.close()
+
+    # NOTE(review): measured after the profiler stop request above, so that
+    # request's latency is included in the duration when profile is set.
+    benchmark_duration = time.perf_counter() - benchmark_start_time
+
+    metrics, actual_output_lens = calculate_metrics(
+        input_requests=input_requests,
+        outputs=outputs,
+        dur_s=benchmark_duration,
+        tokenizer=tokenizer,
+        selected_percentile_metrics=selected_percentile_metrics,
+        selected_percentiles=selected_percentiles,
+        goodput_config_dict=goodput_config_dict,
+    )
+
+    print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
+    print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
+    print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
+    print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
+    print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output))
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Request throughput (req/s):", metrics.request_throughput
+        )
+    )
+    if goodput_config_dict:
+        print(
+            "{:<40} {:<10.2f}".format(
+                "Request goodput (req/s):", metrics.request_goodput
+            )
+        )
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Output token throughput (tok/s):", metrics.output_throughput
+        )
+    )
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Total Token throughput (tok/s):", metrics.total_token_throughput
+        )
+    )
+
+    # Aggregate figures plus per-request raw data, in request order.
+    result = {
+        "duration": benchmark_duration,
+        "completed": metrics.completed,
+        "total_input_tokens": metrics.total_input,
+        "total_output_tokens": metrics.total_output,
+        "request_throughput": metrics.request_throughput,
+        "request_goodput": metrics.request_goodput if goodput_config_dict else None,
+        "output_throughput": metrics.output_throughput,
+        "total_token_throughput": metrics.total_token_throughput,
+        "input_lens": [output.prompt_len for output in outputs],
+        "output_lens": actual_output_lens,
+        "ttfts": [output.ttft for output in outputs],
+        "itls": [output.itl for output in outputs],
+        "generated_texts": [output.generated_text for output in outputs],
+        "errors": [output.error for output in outputs],
+    }
+
+    if rps_change_events:
+        result["rps_change_events"] = rps_change_events
+
+    def process_one_metric(
+        # E.g., "ttft"
+        metric_attribute_name: str,
+        # E.g., "TTFT"
+        metric_name: str,
+        # E.g., "Time to First Token"
+        metric_header: str,
+    ):
+        # This function prints and adds statistics of the specified
+        # metric.
+        # Metrics not listed in selected_percentile_metrics are neither
+        # printed nor written to the result dict.
+        if metric_attribute_name not in selected_percentile_metrics:
+            return
+        print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-"))
+        print(
+            "{:<40} {:<10.2f}".format(
+                f"Mean {metric_name} (ms):",
+                getattr(metrics, f"mean_{metric_attribute_name}_ms"),
+            )
+        )
+        print(
+            "{:<40} {:<10.2f}".format(
+                f"Median {metric_name} (ms):",
+                getattr(metrics, f"median_{metric_attribute_name}_ms"),
+            )
+        )
+        result[f"mean_{metric_attribute_name}_ms"] = getattr(
+            metrics, f"mean_{metric_attribute_name}_ms"
+        )
+        result[f"median_{metric_attribute_name}_ms"] = getattr(
+            metrics, f"median_{metric_attribute_name}_ms"
+        )
+        result[f"std_{metric_attribute_name}_ms"] = getattr(
+            metrics, f"std_{metric_attribute_name}_ms"
+        )
+        for p, value in getattr(metrics, f"percentiles_{metric_attribute_name}_ms"):
+            # Whole-number percentiles are keyed without a decimal part,
+            # e.g. 99.0 -> "p99", while 99.9 stays "p99.9".
+            p_word = str(int(p)) if int(p) == p else str(p)
+            print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", value))
+            result[f"p{p_word}_{metric_attribute_name}_ms"] = value
+
+    process_one_metric("ttft", "TTFT", "Time to First Token")
+    process_one_metric("tpot", "TPOT", "Time per Output Token (excl. 1st token)")
+    process_one_metric("itl", "ITL", "Inter-token Latency")
+    process_one_metric("e2el", "E2EL", "End-to-end Latency")
+
+    print("=" * 50)
+
+    return result
+
+
+def check_goodput_args(args):
+    # Check and parse goodput arguments
+    goodput_config_dict = {}
+    VALID_NAMES = ["ttft", "tpot", "e2el"]
+    if args.goodput:
+        goodput_config_dict = parse_goodput(args.goodput)
+        for slo_name, slo_val in goodput_config_dict.items():
+            if slo_name not in VALID_NAMES:
+                raise ValueError(
+                    f"Invalid metric name found, {slo_name}: {slo_val}. "
+                    "The service level objective name should be one of "
+                    f"{str(VALID_NAMES)}. "
+                )
+            if slo_val < 0:
+                raise ValueError(
+                    f"Invalid value found, {slo_name}: {slo_val}. "
+                    "The service level objective value should be "
+                    "non-negative."
+                )
+    return goodput_config_dict
+
+
+def parse_goodput(slo_pairs):
+    goodput_config_dict = {}
+    try:
+        for slo_pair in slo_pairs:
+            slo_name, slo_val = slo_pair.split(":")
+            goodput_config_dict[slo_name] = float(slo_val)
+    except ValueError as err:
+        raise argparse.ArgumentTypeError(
+            "Invalid format found for service level objectives. "
+            'Specify service level objectives for goodput as "KEY:VALUE" '
+            "pairs, where the key is a metric name, and the value is a "
+            "number in milliseconds."
+        ) from err
+    return goodput_config_dict
+
+
+def save_to_pytorch_benchmark_format(
+    args: argparse.Namespace, results: dict[str, Any], file_name: str
+) -> None:
+    metrics = [
+        "median_ttft_ms",
+        "mean_ttft_ms",
+        "std_ttft_ms",
+        "p99_ttft_ms",
+        "mean_tpot_ms",
+        "median_tpot_ms",
+        "std_tpot_ms",
+        "p99_tpot_ms",
+        "median_itl_ms",
+        "mean_itl_ms",
+        "std_itl_ms",
+        "p99_itl_ms",
+    ]
+    # These raw data might be useful, but they are rather big. They can be added
+    # later if needed
+    ignored_metrics = ["ttfts", "itls", "generated_texts", "errors"]
+    pt_records = convert_to_pytorch_benchmark_format(
+        args=args,
+        metrics={k: [results[k]] for k in metrics},
+        extra_info={
+            k: results[k]
+            for k in results
+            if k not in metrics and k not in ignored_metrics
+        },
+    )
+    if pt_records:
+        # Don't use json suffix here as we don't want CI to pick it up
+        pt_file = f"{os.path.splitext(file_name)[0]}.pytorch.json"
+        write_to_json(pt_file, pt_records)
+
+
+def main(args: argparse.Namespace):
+    print(args)
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+
+    backend = args.backend
+    model_id = args.model
+    model_name = args.served_model_name
+    tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
+    tokenizer_mode = args.tokenizer_mode
+
+    # Validate ramp-up arguments
+    if args.ramp_up_strategy is not None:
+        if args.request_rate != float("inf"):
+            raise ValueError(
+                "When using ramp-up, do not specify --request-rate. "
+                "The request rate will be controlled by ramp-up parameters. "
+                "Please remove the --request-rate argument."
+            )
+        if args.ramp_up_start_rps is None or args.ramp_up_end_rps is None:
+            raise ValueError(
+                "When using --ramp-up-strategy, both --ramp-up-start-rps and "
+                "--ramp-up-end-rps must be specified"
+            )
+        if args.ramp_up_start_rps < 0 or args.ramp_up_end_rps < 0:
+            raise ValueError("Ramp-up start and end RPS must be non-negative")
+        if args.ramp_up_start_rps > args.ramp_up_end_rps:
+            raise ValueError("Ramp-up start RPS must be less than end RPS")
+        if args.ramp_up_strategy == "exponential" and args.ramp_up_start_rps == 0:
+            raise ValueError("For exponential ramp-up, the start RPS cannot be 0.")
+
+    if args.base_url is not None:
+        api_url = f"{args.base_url}{args.endpoint}"
+        base_url = f"{args.base_url}"
+    else:
+        api_url = f"http://{args.host}:{args.port}{args.endpoint}"
+        base_url = f"http://{args.host}:{args.port}"
+
+    tokenizer = get_tokenizer(
+        tokenizer_id,
+        tokenizer_mode=tokenizer_mode,
+        trust_remote_code=args.trust_remote_code,
+    )
+
+    if args.dataset_name is None:
+        raise ValueError(
+            "Please specify '--dataset-name' and the corresponding "
+            "'--dataset-path' if required."
+        )
+
+    if args.dataset_name == "custom":
+        dataset = CustomDataset(dataset_path=args.dataset_path)
+        input_requests = dataset.sample(
+            num_requests=args.num_prompts,
+            tokenizer=tokenizer,
+            output_len=args.custom_output_len,
+            skip_chat_template=args.custom_skip_chat_template,
+        )
+
+    elif args.dataset_name == "sonnet":
+        dataset = SonnetDataset(dataset_path=args.dataset_path)
+        # For the "sonnet" dataset, formatting depends on the backend.
+        if args.backend == "openai-chat":
+            input_requests = dataset.sample(
+                num_requests=args.num_prompts,
+                input_len=args.sonnet_input_len,
+                output_len=args.sonnet_output_len,
+                prefix_len=args.sonnet_prefix_len,
+                tokenizer=tokenizer,
+                return_prompt_formatted=False,
+            )
+        else:
+            assert tokenizer.chat_template or tokenizer.default_chat_template, (
+                "Tokenizer/model must have chat template for sonnet dataset."
+            )
+            input_requests = dataset.sample(
+                num_requests=args.num_prompts,
+                input_len=args.sonnet_input_len,
+                output_len=args.sonnet_output_len,
+                prefix_len=args.sonnet_prefix_len,
+                tokenizer=tokenizer,
+                return_prompt_formatted=True,
+            )
+
+    elif args.dataset_name == "hf":
+        # all following datasets are implemented from the
+        # HuggingFaceDataset base class
+        if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
+            dataset_class = VisionArenaDataset
+            args.hf_split = "train"
+            args.hf_subset = None
+        elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
+            dataset_class = InstructCoderDataset
+            args.hf_split = "train"
+        elif args.dataset_path in MTBenchDataset.SUPPORTED_DATASET_PATHS:
+            dataset_class = MTBenchDataset
+            args.hf_split = "train"
+        elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
+            dataset_class = ConversationDataset
+        elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS:
+            dataset_class = AIMODataset
+            args.hf_split = "train"
+        elif args.dataset_path in NextEditPredictionDataset.SUPPORTED_DATASET_PATHS:  # noqa: E501
+            dataset_class = NextEditPredictionDataset
+            args.hf_split = "train"
+        elif args.dataset_path in ASRDataset.SUPPORTED_DATASET_PATHS:
+            dataset_class = ASRDataset
+            args.hf_split = "train"
+        else:
+            supported_datasets = set(
+                [
+                    dataset_name
+                    for cls in HuggingFaceDataset.__subclasses__()
+                    for dataset_name in cls.SUPPORTED_DATASET_PATHS
+                ]
+            )
+            raise ValueError(
+                f"Unsupported dataset path: {args.dataset_path}. "
+                "Huggingface dataset only supports dataset_path"
+                f" from one of following: {supported_datasets}. "
+                "Please consider contributing if you would "
+                "like to add support for additional dataset formats."
+            )
+
+        if dataset_class.IS_MULTIMODAL and backend not in [
+            "openai-chat",
+            "openai-audio",
+        ]:
+            # multi-modal benchmark is only available on OpenAI Chat backend.
+            raise ValueError(
+                "Multi-modal content is only supported on 'openai-chat' and "
+                "'openai-audio' backend."
+            )
+        input_requests = dataset_class(
+            dataset_path=args.dataset_path,
+            dataset_subset=args.hf_subset,
+            dataset_split=args.hf_split,
+            random_seed=args.seed,
+        ).sample(
+            num_requests=args.num_prompts,
+            tokenizer=tokenizer,
+            output_len=args.hf_output_len,
+        )
+
+    else:
+        # For datasets that follow a similar structure, use a mapping.
+        dataset_mapping = {
+            "sharegpt": lambda: ShareGPTDataset(
+                random_seed=args.seed, dataset_path=args.dataset_path
+            ).sample(
+                tokenizer=tokenizer,
+                num_requests=args.num_prompts,
+                output_len=args.sharegpt_output_len,
+            ),
+            "burstgpt": lambda: BurstGPTDataset(
+                random_seed=args.seed, dataset_path=args.dataset_path
+            ).sample(tokenizer=tokenizer, num_requests=args.num_prompts),
+            "random": lambda: RandomDataset(dataset_path=args.dataset_path).sample(
+                tokenizer=tokenizer,
+                num_requests=args.num_prompts,
+                prefix_len=args.random_prefix_len,
+                input_len=args.random_input_len,
+                output_len=args.random_output_len,
+                range_ratio=args.random_range_ratio,
+            ),
+        }
+
+        try:
+            input_requests = dataset_mapping[args.dataset_name]()
+        except KeyError as err:
+            raise ValueError(f"Unknown dataset: {args.dataset_name}") from err
+    goodput_config_dict = check_goodput_args(args)
+
+    # Collect the sampling parameters.
+    sampling_params = {
+        k: v
+        for k, v in {
+            "top_p": args.top_p,
+            "top_k": args.top_k,
+            "min_p": args.min_p,
+            "temperature": args.temperature,
+        }.items()
+        if v is not None
+    }
+
+    # Sampling parameters are only supported by openai-compatible backend.
+    if sampling_params and args.backend not in OPENAI_COMPATIBLE_BACKENDS:
+        raise ValueError(
+            "Sampling parameters are only supported by openai-compatible backends."
+        )
+
+    if "temperature" not in sampling_params:
+        sampling_params["temperature"] = 0.0  # Default to greedy decoding.
+
+    if args.backend == "llama.cpp":
+        # Disable prompt caching in llama.cpp backend
+        sampling_params["cache_prompt"] = False
+
+    # Avoid GC processing "static" data - reduce pause times.
+    gc.collect()
+    gc.freeze()
+
+    benchmark_result = asyncio.run(
+        benchmark(
+            backend=backend,
+            api_url=api_url,
+            base_url=base_url,
+            model_id=model_id,
+            model_name=model_name,
+            tokenizer=tokenizer,
+            input_requests=input_requests,
+            logprobs=args.logprobs,
+            request_rate=args.request_rate,
+            burstiness=args.burstiness,
+            disable_tqdm=args.disable_tqdm,
+            profile=args.profile,
+            selected_percentile_metrics=args.percentile_metrics.split(","),
+            selected_percentiles=[float(p) for p in args.metric_percentiles.split(",")],
+            ignore_eos=args.ignore_eos,
+            goodput_config_dict=goodput_config_dict,
+            max_concurrency=args.max_concurrency,
+            lora_modules=args.lora_modules,
+            extra_body=sampling_params,
+            ramp_up_strategy=args.ramp_up_strategy,
+            ramp_up_start_rps=args.ramp_up_start_rps,
+            ramp_up_end_rps=args.ramp_up_end_rps,
+        )
+    )
+
+    # Save config and results to json
+    if args.save_result or args.append_result:
+        result_json: dict[str, Any] = {}
+
+        # Setup
+        current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
+        result_json["date"] = current_dt
+        result_json["backend"] = backend
+        result_json["model_id"] = model_id
+        result_json["tokenizer_id"] = tokenizer_id
+        result_json["num_prompts"] = args.num_prompts
+
+        # Metadata
+        if args.metadata:
+            for item in args.metadata:
+                if "=" in item:
+                    kvstring = item.split("=")
+                    result_json[kvstring[0].strip()] = kvstring[1].strip()
+                else:
+                    raise ValueError(
+                        "Invalid metadata format. Please use KEY=VALUE format."
+                    )
+        # Traffic
+        result_json["request_rate"] = (
+            args.request_rate if args.request_rate < float("inf") else "inf"
+        )
+        result_json["burstiness"] = args.burstiness
+        result_json["max_concurrency"] = args.max_concurrency
+
+        if args.ramp_up_strategy is not None:
+            result_json["ramp_up_strategy"] = args.ramp_up_strategy
+            result_json["ramp_up_start_rps"] = args.ramp_up_start_rps
+            result_json["ramp_up_end_rps"] = args.ramp_up_end_rps
+
+        # Merge with benchmark result
+        result_json = {**result_json, **benchmark_result}
+
+        if not args.save_detailed:
+            # Remove fields with too many data points
+            for field in [
+                "input_lens",
+                "output_lens",
+                "ttfts",
+                "itls",
+                "generated_texts",
+                "errors",
+            ]:
+                if field in result_json:
+                    del result_json[field]
+                if field in benchmark_result:
+                    del benchmark_result[field]
+
+        # Save to file
+        base_model_id = model_id.split("/")[-1]
+        max_concurrency_str = (
+            f"-concurrency{args.max_concurrency}"
+            if args.max_concurrency is not None
+            else ""
+        )
+        if args.ramp_up_strategy is not None:
+            file_name = f"{backend}-ramp-up-{args.ramp_up_strategy}-{args.ramp_up_start_rps}qps-{args.ramp_up_end_rps}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json"  # noqa
+        else:
+            file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json"  # noqa
+        if args.result_filename:
+            file_name = args.result_filename
+        if args.result_dir:
+            os.makedirs(args.result_dir, exist_ok=True)
+            file_name = os.path.join(args.result_dir, file_name)
+        with open(
+            file_name, mode="a+" if args.append_result else "w", encoding="utf-8"
+        ) as outfile:
+            # Append a newline.
+            if args.append_result and outfile.tell() != 0:
+                outfile.write("\n")
+            json.dump(result_json, outfile)
+        save_to_pytorch_benchmark_format(args, result_json, file_name)
+
+
+def create_argument_parser():
+    """Build the command-line parser for the online serving benchmark.
+
+    Registers the general benchmark options, then one argument group per
+    dataset type (custom, sonnet, sharegpt, random, hf), the sampling
+    parameters, and finally the tokenizer-mode, served-model-name, LoRA and
+    ramp-up options.
+
+    Returns:
+        The configured ``FlexibleArgumentParser``. Nothing is parsed here;
+        the caller invokes ``parse_args()`` on the returned parser.
+    """
+    parser = FlexibleArgumentParser(
+        description="Benchmark the online serving throughput."
+    )
+    parser.add_argument(
+        "--backend",
+        type=str,
+        default="vllm",
+        choices=list(ASYNC_REQUEST_FUNCS.keys()),
+    )
+    parser.add_argument(
+        "--base-url",
+        type=str,
+        default=None,
+        help="Server or API base url if not using http host and port.",
+    )
+    # Use 127.0.0.1 here instead of localhost to force the use of ipv4
+    parser.add_argument("--host", type=str, default="127.0.0.1")
+    parser.add_argument("--port", type=int, default=8000)
+    parser.add_argument(
+        "--endpoint",
+        type=str,
+        default="/v1/completions",
+        help="API endpoint.",
+    )
+    parser.add_argument(
+        "--dataset-name",
+        type=str,
+        default="sharegpt",
+        choices=["sharegpt", "burstgpt", "sonnet", "random", "hf", "custom"],
+        help="Name of the dataset to benchmark on.",
+    )
+    parser.add_argument(
+        "--dataset-path",
+        type=str,
+        default=None,
+        help="Path to the sharegpt/sonnet dataset. "
+        "Or the huggingface dataset ID if using HF dataset.",
+    )
+    parser.add_argument(
+        "--max-concurrency",
+        type=int,
+        default=None,
+        help="Maximum number of concurrent requests. This can be used "
+        "to help simulate an environment where a higher level component "
+        "is enforcing a maximum number of concurrent requests. While the "
+        "--request-rate argument controls the rate at which requests are "
+        "initiated, this argument will control how many are actually allowed "
+        "to execute at a time. This means that when used in combination, the "
+        "actual request rate may be lower than specified with --request-rate, "
+        "if the server is not processing requests fast enough to keep up.",
+    )
+
+    parser.add_argument(
+        "--model",
+        type=str,
+        required=True,
+        help="Name of the model.",
+    )
+    parser.add_argument(
+        "--tokenizer",
+        type=str,
+        help="Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
+    )
+    parser.add_argument("--use-beam-search", action="store_true")
+    parser.add_argument(
+        "--num-prompts",
+        type=int,
+        default=1000,
+        help="Number of prompts to process.",
+    )
+    parser.add_argument(
+        "--logprobs",
+        type=int,
+        default=None,
+        help=(
+            "Number of logprobs-per-token to compute & return as part of "
+            "the request. If unspecified, then either (1) if beam search "
+            "is disabled, no logprobs are computed & a single dummy "
+            "logprob is returned for each token; or (2) if beam search "
+            "is enabled 1 logprob per token is computed"
+        ),
+    )
+    parser.add_argument(
+        "--request-rate",
+        type=float,
+        default=float("inf"),
+        help="Number of requests per second. If this is inf, "
+        "then all the requests are sent at time 0. "
+        "Otherwise, we use Poisson process or gamma distribution "
+        "to synthesize the request arrival times.",
+    )
+    parser.add_argument(
+        "--burstiness",
+        type=float,
+        default=1.0,
+        help="Burstiness factor of the request generation. "
+        "Only take effect when request_rate is not inf. "
+        "Default value is 1, which follows Poisson process. "
+        "Otherwise, the request intervals follow a gamma distribution. "
+        "A lower burstiness value (0 < burstiness < 1) results in more "
+        "bursty requests. A higher burstiness value (burstiness > 1) "
+        "results in a more uniform arrival of requests.",
+    )
+    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument(
+        "--trust-remote-code",
+        action="store_true",
+        help="Trust remote code from huggingface",
+    )
+    parser.add_argument(
+        "--disable-tqdm",
+        action="store_true",
+        help="Specify to disable tqdm progress bar.",
+    )
+    parser.add_argument(
+        "--profile",
+        action="store_true",
+        help="Use Torch Profiler. The endpoint must be launched with "
+        "VLLM_TORCH_PROFILER_DIR to enable profiler.",
+    )
+    parser.add_argument(
+        "--save-result",
+        action="store_true",
+        help="Specify to save benchmark results to a json file",
+    )
+    parser.add_argument(
+        "--save-detailed",
+        action="store_true",
+        help="When saving the results, whether to include per request "
+        "information such as response, error, ttfts, tpots, etc.",
+    )
+    parser.add_argument(
+        "--append-result",
+        action="store_true",
+        help="Append the benchmark result to the existing json file.",
+    )
+    parser.add_argument(
+        "--metadata",
+        metavar="KEY=VALUE",
+        nargs="*",
+        help="Key-value pairs (e.g., --metadata version=0.3.3 tp=1) "
+        "for metadata of this run to be saved in the result JSON file "
+        "for record keeping purposes.",
+    )
+    parser.add_argument(
+        "--result-dir",
+        type=str,
+        default=None,
+        help="Specify directory to save benchmark json results. "
+        "If not specified, results are saved in the current directory.",
+    )
+    parser.add_argument(
+        "--result-filename",
+        type=str,
+        default=None,
+        help="Specify the filename to save benchmark json results. "
+        "If not specified, results will be saved in "
+        "{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"
+        " format.",
+    )
+    parser.add_argument(
+        "--ignore-eos",
+        action="store_true",
+        help="Set ignore_eos flag when sending the benchmark request. "
+        "Warning: ignore_eos is not supported in deepspeed_mii and tgi.",
+    )
+    parser.add_argument(
+        "--percentile-metrics",
+        type=str,
+        default="ttft,tpot,itl",
+        help="Comma-separated list of selected metrics to report percentiles. "
+        "This argument specifies the metrics to report percentiles. "
+        'Allowed metric names are "ttft", "tpot", "itl", "e2el". '
+        'Default value is "ttft,tpot,itl".',
+    )
+    parser.add_argument(
+        "--metric-percentiles",
+        type=str,
+        default="99",
+        help="Comma-separated list of percentiles for selected metrics. "
+        'To report 25-th, 50-th, and 75-th percentiles, use "25,50,75". '
+        'Default value is "99". '
+        'Use "--percentile-metrics" to select metrics.',
+    )
+    parser.add_argument(
+        "--goodput",
+        nargs="+",
+        required=False,
+        help='Specify service level objectives for goodput as "KEY:VALUE" '
+        "pairs, where the key is a metric name, and the value is in "
+        'milliseconds. Multiple "KEY:VALUE" pairs can be provided, '
+        "separated by spaces. Allowed request level metric names are "
+        '"ttft", "tpot", "e2el". For more context on the definition of '
+        "goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 "
+        "and the blog: https://hao-ai-lab.github.io/blogs/distserve",
+    )
+
+    # group for dataset specific arguments
+    custom_group = parser.add_argument_group("custom dataset options")
+    custom_group.add_argument(
+        "--custom-output-len",
+        type=int,
+        default=256,
+        help="Number of output tokens per request, used only for custom dataset.",
+    )
+    custom_group.add_argument(
+        "--custom-skip-chat-template",
+        action="store_true",
+        help="Skip applying chat template to prompt, used only for custom dataset.",
+    )
+
+    sonnet_group = parser.add_argument_group("sonnet dataset options")
+    sonnet_group.add_argument(
+        "--sonnet-input-len",
+        type=int,
+        default=550,
+        help="Number of input tokens per request, used only for sonnet dataset.",
+    )
+    sonnet_group.add_argument(
+        "--sonnet-output-len",
+        type=int,
+        default=150,
+        help="Number of output tokens per request, used only for sonnet dataset.",
+    )
+    sonnet_group.add_argument(
+        "--sonnet-prefix-len",
+        type=int,
+        default=200,
+        help="Number of prefix tokens per request, used only for sonnet dataset.",
+    )
+
+    sharegpt_group = parser.add_argument_group("sharegpt dataset options")
+    sharegpt_group.add_argument(
+        "--sharegpt-output-len",
+        type=int,
+        default=None,
+        help="Output length for each request. Overrides the output length "
+        "from the ShareGPT dataset.",
+    )
+
+    random_group = parser.add_argument_group("random dataset options")
+    random_group.add_argument(
+        "--random-input-len",
+        type=int,
+        default=1024,
+        help="Number of input tokens per request, used only for random sampling.",
+    )
+    random_group.add_argument(
+        "--random-output-len",
+        type=int,
+        default=128,
+        help="Number of output tokens per request, used only for random sampling.",
+    )
+    random_group.add_argument(
+        "--random-range-ratio",
+        type=float,
+        default=0.0,
+        help="Range ratio for sampling input/output length, "
+        "used only for random sampling. Must be in the range [0, 1) to define "
+        "a symmetric sampling range "
+        "[length * (1 - range_ratio), length * (1 + range_ratio)].",
+    )
+    random_group.add_argument(
+        "--random-prefix-len",
+        type=int,
+        default=0,
+        help=(
+            "Number of fixed prefix tokens before the random context "
+            "in a request. "
+            "The total input length is the sum of `random-prefix-len` and "
+            "a random "
+            "context length sampled from [input_len * (1 - range_ratio), "
+            "input_len * (1 + range_ratio)]."
+        ),
+    )
+
+    hf_group = parser.add_argument_group("hf dataset options")
+    hf_group.add_argument(
+        "--hf-subset", type=str, default=None, help="Subset of the HF dataset."
+    )
+    hf_group.add_argument(
+        "--hf-split", type=str, default=None, help="Split of the HF dataset."
+    )
+    hf_group.add_argument(
+        "--hf-output-len",
+        type=int,
+        default=None,
+        help="Output length for each request. Overrides the output lengths "
+        "from the sampled HF dataset.",
+    )
+
+    sampling_group = parser.add_argument_group("sampling parameters")
+    sampling_group.add_argument(
+        "--top-p",
+        type=float,
+        default=None,
+        help="Top-p sampling parameter. Only has effect on openai-compatible backends.",
+    )
+    sampling_group.add_argument(
+        "--top-k",
+        type=int,
+        default=None,
+        help="Top-k sampling parameter. Only has effect on openai-compatible backends.",
+    )
+    sampling_group.add_argument(
+        "--min-p",
+        type=float,
+        default=None,
+        help="Min-p sampling parameter. Only has effect on openai-compatible backends.",
+    )
+    sampling_group.add_argument(
+        "--temperature",
+        type=float,
+        default=None,
+        help="Temperature sampling parameter. Only has effect on "
+        "openai-compatible backends. If not specified, default to greedy "
+        "decoding (i.e. temperature==0.0).",
+    )
+
+    parser.add_argument(
+        "--tokenizer-mode",
+        type=str,
+        default="auto",
+        choices=["auto", "slow", "mistral", "custom"],
+        help='The tokenizer mode.\n\n* "auto" will use the '
+        'fast tokenizer if available.\n* "slow" will '
+        "always use the slow tokenizer. \n* "
+        '"mistral" will always use the `mistral_common` tokenizer. \n* '
+        '"custom" will use --tokenizer to select the preregistered tokenizer.',
+    )
+
+    parser.add_argument(
+        "--served-model-name",
+        type=str,
+        default=None,
+        help="The model name used in the API. "
+        "If not specified, the model name will be the "
+        "same as the ``--model`` argument. ",
+    )
+
+    parser.add_argument(
+        "--lora-modules",
+        nargs="+",
+        default=None,
+        help="A subset of LoRA module names passed in when "
+        "launching the server. For each request, the "
+        "script chooses a LoRA module at random.",
+    )
+
+    parser.add_argument(
+        "--ramp-up-strategy",
+        type=str,
+        default=None,
+        choices=["linear", "exponential"],
+        help="The ramp-up strategy. This would be used to "
+        "ramp up the request rate from initial RPS to final "
+        "RPS rate (specified by --ramp-up-start-rps and --ramp-up-end-rps) "
+        "over the duration of the benchmark.",
+    )
+    parser.add_argument(
+        "--ramp-up-start-rps",
+        type=int,
+        default=None,
+        help="The starting request rate for ramp-up (RPS). "
+        "Needs to be specified when --ramp-up-strategy is used.",
+    )
+    parser.add_argument(
+        "--ramp-up-end-rps",
+        type=int,
+        default=None,
+        help="The ending request rate for ramp-up (RPS). "
+        "Needs to be specified when --ramp-up-strategy is used.",
+    )
+
+    return parser
+
+
+if __name__ == "__main__":
+    # Script entry point: build the CLI parser, parse the command-line
+    # arguments, and hand the resulting namespace to main().
+    parser = create_argument_parser()
+    args = parser.parse_args()
+    main(args)
--- a/benchmark_serving_structured_output.py
+++ b/benchmark_serving_structured_output.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+r"""Benchmark online serving throughput with structured outputs.
+
+On the server side, run one of the following commands:
+    (vLLM OpenAI API server)
+    vllm serve <your_model> --disable-log-requests
+
+On the client side, run:
+    python benchmarks/benchmark_serving_structured_output.py \
+        --backend <backend> \
+        --model <your_model> \
+        --dataset json \
+        --structured-output-ratio 1.0 \
+        --request-rate 10 \
+        --num-prompts 1000
+
+    when using tgi backend, add
+        --endpoint /generate_stream
+    to the end of the command above.
+"""
+
+import argparse
+import asyncio
+import copy
+import dataclasses
+import json
+import os
+import random
+import time
+import uuid
+import warnings
+from collections.abc import AsyncGenerator
+from dataclasses import dataclass
+from typing import Optional
+
+import datasets
+import numpy as np
+import pandas as pd
+from tqdm.asyncio import tqdm
+from transformers import PreTrainedTokenizerBase
+
+from backend_request_func import (
+    ASYNC_REQUEST_FUNCS,
+    RequestFuncInput,
+    RequestFuncOutput,
+)
+
+try:
+    from vllm.transformers_utils.tokenizer import get_tokenizer
+except ImportError:
+    from backend_request_func import get_tokenizer
+
+try:
+    from vllm.utils import FlexibleArgumentParser
+except ImportError:
+    from argparse import ArgumentParser as FlexibleArgumentParser
+
+from vllm.v1.structured_output.backend_xgrammar import (
+    has_xgrammar_unsupported_json_features,
+)
+
+# Number of milliseconds in one second. Goodput SLO values from the goodput
+# config are divided by this constant before being compared with the
+# per-request measurements in calculate_metrics().
+MILLISECONDS_TO_SECONDS_CONVERSION = 1000
+
+
+@dataclass
+class BenchmarkMetrics:
+    """Aggregate results of one benchmark run, as built by ``calculate_metrics``.
+
+    Fields with the ``_ms`` suffix are filled with values multiplied by 1000
+    by ``calculate_metrics``. Each ``percentiles_*_ms`` list holds
+    ``(percentile, value)`` pairs, one per requested percentile.
+    """
+
+    # Number of requests whose output reported success.
+    completed: int
+    # Sum of prompt lengths (in tokens) over the successful requests.
+    total_input: int
+    # Sum of tokenizer-counted output lengths; failed requests count as 0.
+    total_output: int
+    # Successful requests divided by the benchmark duration in seconds.
+    request_throughput: float
+    # Requests meeting every configured SLO, divided by the duration.
+    request_goodput: float
+    # Output tokens divided by the benchmark duration.
+    output_throughput: float
+    # Input plus output tokens divided by the benchmark duration.
+    total_token_throughput: float
+    # TTFT statistics, taken from the `ttft` attribute of each output.
+    mean_ttft_ms: float
+    median_ttft_ms: float
+    std_ttft_ms: float
+    percentiles_ttft_ms: list[tuple[float, float]]
+    # TPOT statistics: (latency - ttft) / (output_len - 1), computed only for
+    # requests that produced more than one output token.
+    mean_tpot_ms: float
+    median_tpot_ms: float
+    std_tpot_ms: float
+    percentiles_tpot_ms: list[tuple[float, float]]
+    # ITL statistics, pooled from the `itl` list of every successful output.
+    mean_itl_ms: float
+    median_itl_ms: float
+    std_itl_ms: float
+    percentiles_itl_ms: list[tuple[float, float]]
+    # E2EL stands for end-to-end latency per request.
+    # It is the time taken on the client side from sending
+    # a request to receiving a complete response.
+    mean_e2el_ms: float
+    median_e2el_ms: float
+    std_e2el_ms: float
+    percentiles_e2el_ms: list[tuple[float, float]]
+
+
+@dataclasses.dataclass
+class SampleRequest:
+    """A class representing a single inference request for benchmarking.
+
+    Attributes:
+        prompt: The input text prompt for the model.
+        prompt_len: The length of the prompt in tokens.
+        expected_output_len: The expected length of the output in tokens.
+        schema: The structured-output constraint attached to the request.
+            Depending on the dataset this is a JSON schema (a dict, or a JSON
+            string for ``xgrammar_bench``), a grammar string, a regex string,
+            or a list of choices.
+        structure_type: The key under which ``schema`` is placed in the
+            request's extra body.
+        completion: Reference completion for the prompt, or ``None`` when the
+            dataset does not provide one.
+    """
+
+    prompt: str
+    prompt_len: int
+    expected_output_len: int
+    # Not always a dict: see the ``schema`` entry in the class docstring.
+    schema: object
+    structure_type: str
+    completion: Optional[str] = None
+
+
+def sample_requests(
+    tokenizer: PreTrainedTokenizerBase, args: argparse.Namespace
+) -> list[SampleRequest]:
+    """Build ``args.num_prompts`` benchmark requests for ``args.dataset``.
+
+    Supported datasets are ``json``, ``json-unique``, ``grammar``, ``regex``,
+    ``choice`` and ``xgrammar_bench``.
+
+    Args:
+        tokenizer: Tokenizer used to measure each prompt's length in tokens
+            (and, for ``xgrammar_bench``, to apply the chat template).
+        args: Parsed command-line arguments. Reads ``dataset``,
+            ``num_prompts``, ``output_len``, ``structure_type`` and, for the
+            JSON datasets, ``json_schema_path``. As a side effect this sets
+            ``args.json_schema_path`` to the bundled default schema when it
+            is ``None``, ``args.regex`` for the ``regex`` dataset, and
+            ``args.choice`` for the ``choice`` dataset.
+
+    Returns:
+        The list of requests to send.
+
+    Raises:
+        ValueError: If ``args.dataset`` is not one of the supported names, or
+            if ``xgrammar_bench`` has no usable entries left after filtering
+            while at least one prompt was requested.
+    """
+    if args.dataset == "json" or args.dataset == "json-unique":
+        if args.json_schema_path is None:
+            dir_path = os.path.dirname(os.path.realpath(__file__))
+            args.json_schema_path = os.path.join(
+                dir_path, "structured_schemas", "structured_schema_1.json"
+            )
+        with open(args.json_schema_path, encoding="utf-8") as f:
+            schema = json.load(f)
+
+        if args.dataset == "json-unique":
+            # Give every request its own schema by adding a uniquely named
+            # optional field, so cached schemas cannot be reused.
+            json_schemas = [copy.deepcopy(schema) for _ in range(args.num_prompts)]
+            for unique_schema in json_schemas:
+                properties = unique_schema.setdefault("properties", {})
+                properties[f"__optional_field_{uuid.uuid4()}"] = {
+                    "type": "string",
+                    "description": "An unique optional field to avoid cached schemas",
+                }
+        else:
+            json_schemas = [schema] * args.num_prompts
+
+        def build_json_request(request_schema) -> SampleRequest:
+            # Render the prompt once and reuse it for the length measurement.
+            prompt = f"Generate an example of a brief user profile given the following schema: {json.dumps(request_schema)}"  # noqa: E501
+            return SampleRequest(
+                prompt=prompt,
+                prompt_len=len(tokenizer(prompt).input_ids),
+                expected_output_len=args.output_len,
+                schema=request_schema,
+                structure_type=args.structure_type,
+            )
+
+        requests = [build_json_request(s) for s in json_schemas]
+
+    elif args.dataset == "grammar":
+        schema = """
+        root ::= select_statement
+
+        select_statement ::= "SELECT " column " from " table " where " condition
+
+        column ::= "col_1 " | "col_2 "
+
+        table ::= "table_1 " | "table_2 "
+
+        condition ::= column "= " number
+
+        number ::= "1 " | "2 "
+        """
+        prompt = "Generate an SQL query to show the 'username' \
+            and 'email' from the 'users' table."
+
+        input_len = len(tokenizer(prompt).input_ids)
+        print(f"Input length of the prompt: {input_len} tokens")
+        requests = [
+            SampleRequest(
+                prompt=prompt,
+                prompt_len=input_len,
+                expected_output_len=args.output_len,
+                schema=schema,
+                structure_type=args.structure_type,
+            )
+            for _ in range(args.num_prompts)
+        ]
+
+    elif args.dataset == "regex":
+        regex = r"\w+@\w+\.com\n"
+        args.regex = regex
+        prompt = "Generate an email address for Alan Turing, \
+            who works in Enigma. End in .com and new line. \
+                Example result: alan.turing@enigma.com\n"
+
+        input_len = len(tokenizer(prompt).input_ids)
+        print(f"Input length of the prompt: {input_len} tokens")
+        requests = [
+            SampleRequest(
+                prompt=prompt,
+                prompt_len=input_len,
+                expected_output_len=args.output_len,
+                schema=regex,
+                structure_type=args.structure_type,
+            )
+            for _ in range(args.num_prompts)
+        ]
+
+    elif args.dataset == "choice":
+        choice = ["Positive", "Negative"]
+        args.choice = choice
+        prompt = "Classify this sentiment: vLLM is wonderful!"
+        input_len = len(tokenizer(prompt).input_ids)
+        print(f"Input length of the prompt: {input_len} tokens")
+        requests = [
+            SampleRequest(
+                prompt=prompt,
+                prompt_len=input_len,
+                expected_output_len=args.output_len,
+                schema=choice,
+                structure_type=args.structure_type,
+            )
+            for _ in range(args.num_prompts)
+        ]
+
+    elif args.dataset == "xgrammar_bench":
+        requests: list[SampleRequest] = []
+        dataset = datasets.load_dataset("NousResearch/json-mode-eval", split="train")
+        full_dataset_len = len(dataset)
+
+        def _filter_func(item):
+            # NOTE(review): the local import presumably keeps this callback
+            # self-contained for `datasets` -- confirm before removing it.
+            import json
+
+            schema = json.loads(item["schema"])
+            return not has_xgrammar_unsupported_json_features(schema)
+
+        dataset = dataset.filter(_filter_func)
+        num_filtered_out = full_dataset_len - len(dataset)
+        print(
+            f"dataset has {len(dataset)} entries after filtering "
+            f"out {num_filtered_out} entries with unsupported features"
+        )
+        len_dataset = len(dataset)
+        if len_dataset == 0 and args.num_prompts > 0:
+            # Wrapping an index around an empty dataset can never succeed.
+            raise ValueError(
+                "No xgrammar_bench entries are left after filtering out "
+                "schemas with unsupported features."
+            )
+        # Fetch each column once instead of on every loop iteration.
+        schemas = dataset["schema"]
+        prompts = dataset["prompt"]
+        completions = dataset["completion"]
+        for data_point_idx in range(args.num_prompts):
+            # Cycle through the dataset when more prompts are requested than
+            # it contains.
+            idx = data_point_idx % len_dataset
+            schema = schemas[idx]
+            prompt = tokenizer.apply_chat_template(
+                prompts[idx], tokenize=False, add_generation_prompt=True
+            )
+            input_len = len(tokenizer(prompt).input_ids)
+            completion = completions[idx]
+
+            requests.append(
+                SampleRequest(
+                    prompt=prompt,
+                    prompt_len=input_len,
+                    expected_output_len=args.output_len,
+                    schema=schema,
+                    structure_type=args.structure_type,
+                    completion=completion,
+                )
+            )
+
+    else:
+        raise ValueError(f"Unsupported dataset: {args.dataset}")
+
+    return requests
+
+
+async def get_request(
+    input_requests: list[SampleRequest],
+    request_rate: float,
+    burstiness: float = 1.0,
+) -> AsyncGenerator[tuple[int, SampleRequest], None]:
+    """
+    Yield ``(index, request)`` pairs, paced at the requested rate.
+
+    Args:
+        input_requests:
+            The requests to emit, in order.
+        request_rate:
+            Target number of requests per second. With ``float("inf")`` the
+            requests are yielded back to back without sleeping.
+        burstiness (optional):
+            Shape of the gamma distribution the inter-request intervals are
+            drawn from. Must be positive. The default of 1 gives a Poisson
+            process (exponential intervals); values in (0, 1) make arrivals
+            more bursty and values above 1 make them more uniform.
+    """
+    request_iter = iter(input_requests)
+
+    assert burstiness > 0, (
+        f"A positive burstiness factor is expected, but given {burstiness}."
+    )
+    # Scale chosen so that the mean interval (shape * scale) equals
+    # 1 / request_rate regardless of the burstiness.
+    gamma_scale = 1.0 / (request_rate * burstiness)
+    unthrottled = request_rate == float("inf")
+
+    for position, sample in enumerate(request_iter):
+        yield position, sample
+
+        if not unthrottled:
+            # Wait a gamma-distributed interval before the next request.
+            pause = np.random.gamma(shape=burstiness, scale=gamma_scale)
+            await asyncio.sleep(pause)
+
+
+def calculate_metrics(
+    input_requests: list[SampleRequest],
+    outputs: list[RequestFuncOutput],
+    dur_s: float,
+    tokenizer: PreTrainedTokenizerBase,
+    selected_percentile_metrics: list[str],
+    selected_percentiles: list[float],
+    goodput_config_dict: Optional[dict[str, float]] = None,
+) -> tuple[BenchmarkMetrics, list[int]]:
+    """Aggregate per-request outputs into a ``BenchmarkMetrics`` summary.
+
+    Args:
+        input_requests: The requests that were sent; ``input_requests[i]``
+            corresponds to ``outputs[i]`` and supplies its ``prompt_len``.
+        outputs: One result per request. Each successful output has its
+            ``tpot`` attribute set by this function.
+        dur_s: Benchmark duration in seconds, used as the throughput divisor.
+        tokenizer: Tokenizer used to count the generated tokens.
+        selected_percentile_metrics: Accepted for interface compatibility;
+            not used by this function.
+        selected_percentiles: Percentiles to compute for every latency metric.
+        goodput_config_dict: Optional SLOs keyed by ``"ttft"``, ``"tpot"``
+            and/or ``"e2el"``. Each value is divided by
+            ``MILLISECONDS_TO_SECONDS_CONVERSION`` before the comparison.
+
+    Returns:
+        The aggregated metrics and the list of per-request output token
+        counts (0 for failed requests).
+    """
+    actual_output_lens: list[int] = []
+    total_input = 0
+    completed = 0
+    good_completed = 0
+    itls: list[float] = []
+    tpots: list[float] = []
+    all_tpots: list[float] = []
+    ttfts: list[float] = []
+    e2els: list[float] = []
+    for i, output in enumerate(outputs):
+        if not output.success:
+            actual_output_lens.append(0)
+            continue
+
+        # We use the tokenizer to count the number of output tokens for all
+        # serving backends instead of looking at len(output.itl) since
+        # multiple output tokens may be bundled together
+        # Note : this may inflate the output token count slightly
+        output_len = len(
+            tokenizer(output.generated_text, add_special_tokens=False).input_ids
+        )
+        actual_output_lens.append(output_len)
+        total_input += input_requests[i].prompt_len
+        tpot = 0
+        if output_len > 1:
+            latency_minus_ttft = output.latency - output.ttft
+            tpot = latency_minus_ttft / (output_len - 1)
+            tpots.append(tpot)
+        output.tpot = tpot
+        # Note: if output_len <= 1, we regard tpot as 0 for goodput
+        all_tpots.append(tpot)
+        itls.extend(output.itl)
+        ttfts.append(output.ttft)
+        e2els.append(output.latency)
+        completed += 1
+
+    if goodput_config_dict:
+        # The three lists below hold one entry per successful request, in the
+        # same order, so zipping them yields per-request metric tuples.
+        per_request_metrics = {"ttft": ttfts, "tpot": all_tpots, "e2el": e2els}
+        valid_metrics = []
+        slo_values = []
+        for metric_name, values in per_request_metrics.items():
+            if metric_name in goodput_config_dict:
+                valid_metrics.append(values)
+                slo_values.append(
+                    goodput_config_dict[metric_name]
+                    / MILLISECONDS_TO_SECONDS_CONVERSION
+                )
+
+        # A request is "good" when every configured SLO is met.
+        good_completed = sum(
+            all(slo >= observed for slo, observed in zip(slo_values, req_metric))
+            for req_metric in zip(*valid_metrics)
+        )
+
+    if completed == 0:
+        warnings.warn(
+            "All requests failed. This is likely due to a misconfiguration "
+            "on the benchmark arguments.",
+            stacklevel=2,
+        )
+
+    total_output = sum(actual_output_lens)
+
+    def percentiles_ms(values: list[float]) -> list[tuple[float, float]]:
+        # `values or 0` lets numpy work on a scalar when the list is empty.
+        return [
+            (p, np.percentile(values or 0, p) * 1000) for p in selected_percentiles
+        ]
+
+    metrics = BenchmarkMetrics(
+        completed=completed,
+        total_input=total_input,
+        total_output=total_output,
+        request_throughput=completed / dur_s,
+        request_goodput=good_completed / dur_s,
+        output_throughput=total_output / dur_s,
+        total_token_throughput=(total_input + total_output) / dur_s,
+        mean_ttft_ms=np.mean(ttfts or 0)
+        * 1000,  # ttfts is empty if streaming is not supported by backend
+        std_ttft_ms=np.std(ttfts or 0) * 1000,
+        median_ttft_ms=np.median(ttfts or 0) * 1000,
+        percentiles_ttft_ms=percentiles_ms(ttfts),
+        mean_tpot_ms=np.mean(tpots or 0) * 1000,
+        std_tpot_ms=np.std(tpots or 0) * 1000,
+        median_tpot_ms=np.median(tpots or 0) * 1000,
+        percentiles_tpot_ms=percentiles_ms(tpots),
+        mean_itl_ms=np.mean(itls or 0) * 1000,
+        std_itl_ms=np.std(itls or 0) * 1000,
+        median_itl_ms=np.median(itls or 0) * 1000,
+        percentiles_itl_ms=percentiles_ms(itls),
+        mean_e2el_ms=np.mean(e2els or 0) * 1000,
+        std_e2el_ms=np.std(e2els or 0) * 1000,
+        median_e2el_ms=np.median(e2els or 0) * 1000,
+        percentiles_e2el_ms=percentiles_ms(e2els),
+    )
+
+    return metrics, actual_output_lens
+
+
+async def benchmark(
+    backend: str,
+    api_url: str,
+    base_url: str,
+    model_id: str,
+    tokenizer: PreTrainedTokenizerBase,
+    input_requests: list[SampleRequest],
+    request_rate: float,
+    burstiness: float,
+    disable_tqdm: bool,
+    profile: bool,
+    selected_percentile_metrics: list[str],
+    selected_percentiles: list[float],
+    ignore_eos: bool,
+    max_concurrency: Optional[int],
+    structured_output_ratio: float,
+    goodput_config_dict: Optional[dict[str, float]] = None,
+):
+    """Send the sampled requests to the server and summarise the outcome.
+
+    A single warm-up request is issued first; the run is aborted if it
+    fails. The remaining requests are then scheduled by ``get_request`` and
+    executed concurrently, optionally bounded by ``max_concurrency``.
+
+    Args:
+        backend: Key into ``ASYNC_REQUEST_FUNCS`` selecting the request function.
+        api_url: Full URL the benchmark requests are sent to.
+        base_url: Server root, used for the ``/start_profile`` and
+            ``/stop_profile`` endpoints.
+        model_id: Model name placed in every request.
+        tokenizer: Tokenizer forwarded to ``calculate_metrics``.
+        input_requests: Requests to send; must not be empty.
+        request_rate: Forwarded to ``get_request``.
+        burstiness: Forwarded to ``get_request``; 1.0 is reported as a
+            Poisson process, anything else as a Gamma distribution.
+        disable_tqdm: When true, no progress bar is created.
+        profile: When true, the profiler endpoints are called around the run.
+        selected_percentile_metrics: Metric names ("ttft", "tpot", "itl",
+            "e2el") whose statistics are printed and stored.
+        selected_percentiles: Percentiles to report for those metrics.
+        ignore_eos: Forwarded in every request.
+        max_concurrency: Upper bound on in-flight requests, or ``None``
+            (or 0) for no bound.
+        structured_output_ratio: Fraction of requests that carry the
+            structured-output schema in ``extra_body``.
+        goodput_config_dict: Optional SLO mapping forwarded to
+            ``calculate_metrics``.
+
+    Returns:
+        A ``(result, ret)`` tuple: ``result`` is a dict of aggregate numbers
+        and per-request lists, ``ret`` is a list of
+        ``{"generated": ..., "expected": ...}`` dicts, one per request.
+
+    Raises:
+        ValueError: If the backend is unknown, ``input_requests`` is empty,
+            or the warm-up request fails.
+    """
+    if backend in ASYNC_REQUEST_FUNCS:
+        request_func = ASYNC_REQUEST_FUNCS[backend]
+    else:
+        raise ValueError(f"Unknown backend: {backend}")
+
+    # The warm-up below indexes input_requests[0]; fail with a clear message
+    # instead of a bare IndexError.
+    if not input_requests:
+        raise ValueError("input_requests must contain at least one request.")
+
+    def prepare_extra_body(request) -> dict:
+        extra_body = {}
+        # Add the schema to the extra_body
+        extra_body[request.structure_type] = request.schema
+        return extra_body
+
+    print("Starting initial single prompt test run...")
+    # A set keeps the per-request membership checks below O(1); the sampling
+    # call itself is unchanged, so the seeded selection stays the same.
+    structured_output_req_idx = set(
+        random.sample(
+            range(len(input_requests)),
+            int(len(input_requests) * structured_output_ratio),
+        )
+    )
+
+    test_request = input_requests[0]
+    test_req_extra_body = (
+        prepare_extra_body(test_request) if 0 in structured_output_req_idx else None
+    )
+    test_input = RequestFuncInput(
+        model=model_id,
+        prompt=test_request.prompt,
+        api_url=api_url,
+        prompt_len=test_request.prompt_len,
+        output_len=test_request.expected_output_len,
+        ignore_eos=ignore_eos,
+        extra_body=test_req_extra_body,
+    )
+    test_output = await request_func(request_func_input=test_input)
+    if not test_output.success:
+        raise ValueError(
+            "Initial test run failed - Please make sure benchmark arguments "
+            f"are correctly specified. Error: {test_output.error}"
+        )
+    else:
+        print("Initial test run completed. Starting main benchmark run...")
+
+    if profile:
+        print("Starting profiler...")
+        profile_input = RequestFuncInput(
+            model=model_id,
+            prompt=test_request.prompt,
+            api_url=base_url + "/start_profile",
+            prompt_len=test_request.prompt_len,
+            output_len=test_request.expected_output_len,
+            ignore_eos=ignore_eos,
+            extra_body=test_req_extra_body,
+        )
+        profile_output = await request_func(request_func_input=profile_input)
+        if profile_output.success:
+            print("Profiler started")
+        else:
+            # Surface the failure instead of silently benchmarking unprofiled.
+            print(f"Failed to start profiler: {profile_output.error}")
+
+    distribution = "Poisson process" if burstiness == 1.0 else "Gamma distribution"
+
+    print(f"Traffic request rate: {request_rate}")
+    print(f"Burstiness factor: {burstiness} ({distribution})")
+    print(f"Maximum request concurrency: {max_concurrency}")
+
+    pbar = None if disable_tqdm else tqdm(total=len(input_requests))
+
+    # This can be used once the minimum Python version is 3.10 or higher,
+    # and it will simplify the code in limited_request_func.
+    #    semaphore = (asyncio.Semaphore(max_concurrency)
+    #                 if max_concurrency else contextlib.nullcontext())
+    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
+
+    async def limited_request_func(request_func_input, pbar):
+        if semaphore is None:
+            return await request_func(request_func_input=request_func_input, pbar=pbar)
+        async with semaphore:
+            return await request_func(request_func_input=request_func_input, pbar=pbar)
+
+    benchmark_start_time = time.perf_counter()
+    tasks: list[asyncio.Task] = []
+    expected: list[str] = []
+    async for i, request in get_request(input_requests, request_rate, burstiness):
+        extra_body = (
+            prepare_extra_body(request) if i in structured_output_req_idx else None
+        )
+        request_func_input = RequestFuncInput(
+            model=model_id,
+            prompt=request.prompt,
+            api_url=api_url,
+            prompt_len=request.prompt_len,
+            output_len=request.expected_output_len,
+            ignore_eos=ignore_eos,
+            extra_body=extra_body,
+        )
+        expected.append(request.completion)
+        tasks.append(
+            asyncio.create_task(
+                limited_request_func(request_func_input=request_func_input, pbar=pbar)
+            )
+        )
+    outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)
+
+    if profile:
+        print("Stopping profiler...")
+        profile_input = RequestFuncInput(
+            model=model_id,
+            prompt=test_request.prompt,
+            api_url=base_url + "/stop_profile",
+            prompt_len=test_request.prompt_len,
+            output_len=test_request.expected_output_len,
+            extra_body={test_request.structure_type: test_request.schema},
+        )
+        profile_output = await request_func(request_func_input=profile_input)
+        if profile_output.success:
+            print("Profiler stopped")
+        else:
+            print(f"Failed to stop profiler: {profile_output.error}")
+
+    if pbar is not None:
+        pbar.close()
+
+    # Note: the measured duration includes the stop-profile round trip above.
+    benchmark_duration = time.perf_counter() - benchmark_start_time
+
+    metrics, actual_output_lens = calculate_metrics(
+        input_requests=input_requests,
+        outputs=outputs,
+        dur_s=benchmark_duration,
+        tokenizer=tokenizer,
+        selected_percentile_metrics=selected_percentile_metrics,
+        selected_percentiles=selected_percentiles,
+        goodput_config_dict=goodput_config_dict,
+    )
+
+    print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
+    print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
+    print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
+    print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
+    print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output))
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Request throughput (req/s):", metrics.request_throughput
+        )
+    )
+    if goodput_config_dict:
+        print(
+            "{:<40} {:<10.2f}".format(
+                "Request goodput (req/s):", metrics.request_goodput
+            )
+        )
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Output token throughput (tok/s):", metrics.output_throughput
+        )
+    )
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Total Token throughput (tok/s):", metrics.total_token_throughput
+        )
+    )
+
+    result = {
+        "duration": benchmark_duration,
+        "completed": metrics.completed,
+        "total_input_tokens": metrics.total_input,
+        "total_output_tokens": metrics.total_output,
+        "request_throughput": metrics.request_throughput,
+        "output_throughput": metrics.output_throughput,
+        "total_token_throughput": metrics.total_token_throughput,
+        "ttft_description": pd.Series([output.ttft for output in outputs])
+        .describe()
+        .to_dict(),
+        "tpot_description": pd.Series([output.tpot for output in outputs])
+        .describe()
+        .to_dict(),
+        "input_lens": [output.prompt_len for output in outputs],
+        "output_lens": actual_output_lens,
+        "ttfts": [output.ttft for output in outputs],
+        "itls": [output.itl for output in outputs],
+        "errors": [output.error for output in outputs],
+    }
+
+    # Pair every generated text with the completion recorded when the
+    # request was scheduled; both lists are in scheduling order.
+    ret = [
+        {"generated": output.generated_text, "expected": gt}
+        for output, gt in zip(outputs, expected)
+    ]
+
+    def process_one_metric(
+        # E.g., "ttft"
+        metric_attribute_name: str,
+        # E.g., "TTFT"
+        metric_name: str,
+        # E.g., "Time to First Token"
+        metric_header: str,
+    ):
+        # This function prints and adds statistics of the specified
+        # metric.
+        if metric_attribute_name not in selected_percentile_metrics:
+            return
+        print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-"))
+        print(
+            "{:<40} {:<10.2f}".format(
+                f"Mean {metric_name} (ms):",
+                getattr(metrics, f"mean_{metric_attribute_name}_ms"),
+            )
+        )
+        print(
+            "{:<40} {:<10.2f}".format(
+                f"Median {metric_name} (ms):",
+                getattr(metrics, f"median_{metric_attribute_name}_ms"),
+            )
+        )
+        result[f"mean_{metric_attribute_name}_ms"] = getattr(
+            metrics, f"mean_{metric_attribute_name}_ms"
+        )
+        result[f"median_{metric_attribute_name}_ms"] = getattr(
+            metrics, f"median_{metric_attribute_name}_ms"
+        )
+        result[f"std_{metric_attribute_name}_ms"] = getattr(
+            metrics, f"std_{metric_attribute_name}_ms"
+        )
+        for p, value in getattr(metrics, f"percentiles_{metric_attribute_name}_ms"):
+            # Whole-number percentiles are labelled "P99" rather than "P99.0".
+            p_word = str(int(p)) if int(p) == p else str(p)
+            print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", value))
+            result[f"p{p_word}_{metric_attribute_name}_ms"] = value
+
+    process_one_metric("ttft", "TTFT", "Time to First Token")
+    process_one_metric("tpot", "TPOT", "Time per Output Token (excl. 1st token)")
+    process_one_metric("itl", "ITL", "Inter-token Latency")
+    process_one_metric("e2el", "E2EL", "End-to-end Latency")
+
+    print("=" * 50)
+
+    return result, ret
+
+
+def evaluate(ret, args):
+    """Score generated outputs against the active structured-output mode.
+
+    Every entry of ``ret`` gets a ``"correctness"`` key written in place:
+    ``True``/``False`` for the JSON, regex and choice modes, ``None`` for any
+    other ``args.structure_type``.
+
+    Returns:
+        The percentage of entries judged correct among those that received a
+        non-``None`` verdict, or ``None`` when no entry could be judged.
+    """
+
+    def _is_parsable_json(_expected, generated):
+        # Pull the outermost {...} span out of the text and try to parse it.
+        import regex as re
+
+        compact = generated.replace("\n", "").replace(" ", "").strip()
+        try:
+            json.loads(re.search(r"\{.*\}", compact).group())
+        except Exception:
+            return False
+        return True
+
+    def _is_listed_choice(_expected, generated):
+        return generated in args.choice
+
+    def _matches_pattern(_expected, generated):
+        import regex as re
+
+        return re.match(args.regex, generated) is not None
+
+    checkers = {
+        "guided_json": _is_parsable_json,
+        "guided_regex": _matches_pattern,
+        "guided_choice": _is_listed_choice,
+    }
+
+    verdicts = []
+    for entry in ret:
+        expected, generated = entry["expected"], entry["generated"]
+        checker = checkers.get(args.structure_type)
+        verdict = checker(expected, generated) if checker is not None else None
+        entry["correctness"] = verdict
+        if verdict is not None:
+            verdicts.append(verdict)
+
+    if not verdicts:
+        return None
+    return sum(verdicts) / len(verdicts) * 100
+
+
+def parse_goodput(slo_pairs):
+    """Turn ``"KEY:VALUE"`` strings into a ``{key: float(value)}`` mapping.
+
+    Raises:
+        argparse.ArgumentTypeError: If an item does not split into exactly
+            two parts on ``":"`` or its value is not a valid float.
+    """
+    try:
+        return {
+            name: float(raw_value)
+            for name, raw_value in (pair.split(":") for pair in slo_pairs)
+        }
+    except ValueError as err:
+        raise argparse.ArgumentTypeError(
+            "Invalid format found for service level objectives. "
+            'Specify service level objectives for goodput as "KEY:VALUE" '
+            "pairs, where the key is a metric name, and the value is a "
+            "number in milliseconds."
+        ) from err
+
+
+def check_goodput_args(args):
+    """Parse and validate the ``--goodput`` service level objectives.
+
+    Returns:
+        The parsed ``{metric_name: value}`` mapping, or an empty dict when
+        ``args.goodput`` is unset or empty.
+
+    Raises:
+        ValueError: If a metric name is not one of "ttft", "tpot", "e2el",
+            or a value is negative.
+    """
+    if not args.goodput:
+        return {}
+
+    valid_names = ["ttft", "tpot", "e2el"]
+    slo_table = parse_goodput(args.goodput)
+    for slo_name, slo_val in slo_table.items():
+        if slo_name not in valid_names:
+            raise ValueError(
+                f"Invalid metric name found, {slo_name}: {slo_val}. "
+                "The service level objective name should be one of "
+                f"{str(valid_names)}. "
+            )
+        if slo_val < 0:
+            raise ValueError(
+                f"Invalid value found, {slo_name}: {slo_val}. "
+                "The service level objective value should be "
+                "non-negative."
+            )
+    return slo_table
+
+
+def main(args: argparse.Namespace):
+    """Run the structured-output serving benchmark described by ``args``.
+
+    Seeds the RNGs, derives the request URLs and the structure type from the
+    chosen dataset, samples the requests, runs ``benchmark`` and prints the
+    correctness rate. With ``--save-results`` the configuration, the
+    per-request outputs and the benchmark numbers are written to a JSON file.
+
+    Args:
+        args: Parsed command-line arguments. ``structure_type`` is set on it,
+            and ``structured_output_ratio`` is forced to 0 when
+            ``--no-structured-output`` is given.
+    """
+    print(args)
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+
+    backend = args.backend
+    model_id = args.model
+    tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
+
+    if args.base_url is not None:
+        api_url = f"{args.base_url}{args.endpoint}"
+        base_url = f"{args.base_url}"
+    else:
+        api_url = f"http://{args.host}:{args.port}{args.endpoint}"
+        base_url = f"http://{args.host}:{args.port}"
+
+    tokenizer = get_tokenizer(
+        tokenizer_id,
+        trust_remote_code=args.trust_remote_code,
+        tokenizer_mode=args.tokenizer_mode,
+    )
+
+    # Every dataset without an explicit entry here is treated as JSON.
+    structure_type_by_dataset = {
+        "grammar": "guided_grammar",
+        "regex": "guided_regex",
+        "choice": "guided_choice",
+    }
+    args.structure_type = structure_type_by_dataset.get(args.dataset, "guided_json")
+
+    if args.no_structured_output:
+        args.structured_output_ratio = 0
+    if args.save_results:
+        result_file_name = f"{args.structured_output_ratio}guided"
+        result_file_name += f"_{backend}"
+        result_file_name += f"_{args.request_rate}qps"
+        result_file_name += f"_{args.model.split('/')[-1]}"
+        result_file_name += f"_{args.dataset}"
+        result_file_name += f"_{args.num_prompts}"
+        result_file_name += f"_out{args.output_len}"
+        result_file_name += ".txt"
+    else:
+        result_file_name = None
+
+    input_requests = sample_requests(tokenizer, args)
+
+    goodput_config_dict = check_goodput_args(args)
+
+    benchmark_result, ret = asyncio.run(
+        benchmark(
+            backend=backend,
+            api_url=api_url,
+            base_url=base_url,
+            model_id=model_id,
+            tokenizer=tokenizer,
+            input_requests=input_requests,
+            request_rate=args.request_rate,
+            burstiness=args.burstiness,
+            disable_tqdm=args.disable_tqdm,
+            profile=args.profile,
+            selected_percentile_metrics=args.percentile_metrics.split(","),
+            selected_percentiles=[float(p) for p in args.metric_percentiles.split(",")],
+            ignore_eos=args.ignore_eos,
+            max_concurrency=args.max_concurrency,
+            structured_output_ratio=args.structured_output_ratio,
+            goodput_config_dict=goodput_config_dict,
+        )
+    )
+
+    score = evaluate(ret, args)
+    print("correct_rate(%)", score, "\n")
+
+    # Save config and results to json
+    if args.save_results:
+        results = {
+            "backend": backend,
+            "model_id": model_id,
+            "tokenizer_id": tokenizer_id,
+            "num_prompts": args.num_prompts,
+            "request_rate": args.request_rate
+            if args.request_rate < float("inf")
+            else "inf",
+            "burstiness": args.burstiness,
+            "max_concurrency": args.max_concurrency,
+            "correct_rate(%)": score,
+        }
+        results = {"outputs": ret, **results, **benchmark_result}
+
+        # Save to file
+        if args.result_filename:
+            result_file_name = args.result_filename
+        if args.result_dir:
+            # Create the directory up front so a finished benchmark is not
+            # lost to a FileNotFoundError when the results are written.
+            os.makedirs(args.result_dir, exist_ok=True)
+            result_file_name = os.path.join(args.result_dir, result_file_name)
+        with open(result_file_name, "w", encoding="utf-8") as outfile:
+            json.dump(results, outfile, indent=4)
+
+
+def create_argument_parser():
+    """Build the command-line parser for the structured-output benchmark.
+
+    Returns:
+        A ``FlexibleArgumentParser`` with all benchmark options registered.
+    """
+    parser = FlexibleArgumentParser(
+        description="Benchmark the online serving throughput."
+    )
+    parser.add_argument(
+        "--backend",
+        type=str,
+        default="vllm",
+        choices=list(ASYNC_REQUEST_FUNCS.keys()),
+    )
+    parser.add_argument(
+        "--base-url",
+        type=str,
+        default=None,
+        help="Server or API base url if not using http host and port.",
+    )
+    # Use 127.0.0.1 here instead of localhost to force the use of ipv4
+    parser.add_argument("--host", type=str, default="127.0.0.1")
+    parser.add_argument("--port", type=int, default=8000)
+    parser.add_argument(
+        "--endpoint",
+        type=str,
+        default="/v1/completions",
+        help="API endpoint.",
+    )
+    parser.add_argument(
+        "--dataset",
+        default="json",
+        choices=["json", "json-unique", "grammar", "regex", "choice", "xgrammar_bench"],
+    )
+    parser.add_argument(
+        "--json-schema-path", type=str, default=None, help="Path to json schema."
+    )
+    parser.add_argument(
+        "--max-concurrency",
+        type=int,
+        default=None,
+        help="Maximum number of concurrent requests. This can be used "
+        "to help simulate an environment where a higher level component "
+        "is enforcing a maximum number of concurrent requests. While the "
+        "--request-rate argument controls the rate at which requests are "
+        "initiated, this argument will control how many are actually allowed "
+        "to execute at a time. This means that when used in combination, the "
+        "actual request rate may be lower than specified with --request-rate, "
+        "if the server is not processing requests fast enough to keep up.",
+    )
+    parser.add_argument(
+        "--model",
+        type=str,
+        required=True,
+        help="Name of the model.",
+    )
+    parser.add_argument(
+        "--tokenizer",
+        type=str,
+        help="Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
+    )
+    parser.add_argument(
+        "--tokenizer-mode",
+        type=str,
+        default="auto",
+        # The previous text was a copy of the --tokenizer help.
+        help='Tokenizer mode forwarded to get_tokenizer. Default value is "auto".',
+    )
+    parser.add_argument(
+        "--num-prompts",
+        type=int,
+        default=1000,
+        help="Number of prompts to process.",
+    )
+    parser.add_argument(
+        "--output-len",
+        type=int,
+        default=128,
+        help="Number of output tokens.",
+    )
+    parser.add_argument(
+        "--request-rate",
+        type=float,
+        default=float("inf"),
+        help="Number of requests per second. If this is inf, "
+        "then all the requests are sent at time 0. "
+        "Otherwise, we use Poisson process or gamma distribution "
+        "to synthesize the request arrival times.",
+    )
+    parser.add_argument(
+        "--burstiness",
+        type=float,
+        default=1.0,
+        help="Burstiness factor of the request generation. "
+        "Only take effect when request_rate is not inf. "
+        "Default value is 1, which follows Poisson process. "
+        "Otherwise, the request intervals follow a gamma distribution. "
+        "A lower burstiness value (0 < burstiness < 1) results in more "
+        "bursty requests. A higher burstiness value (burstiness > 1) "
+        "results in a more uniform arrival of requests.",
+    )
+    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument(
+        "--trust-remote-code",
+        action="store_true",
+        help="Trust remote code from huggingface",
+    )
+    parser.add_argument(
+        "--disable-tqdm",
+        action="store_true",
+        help="Specify to disable tqdm progress bar.",
+    )
+    parser.add_argument(
+        "--save-results",
+        action="store_true",
+        help="Specify to save benchmark results to a json file",
+    )
+    parser.add_argument(
+        "--profile",
+        action="store_true",
+        help="Use Torch Profiler. The endpoint must be launched with "
+        "VLLM_TORCH_PROFILER_DIR to enable profiler.",
+    )
+    parser.add_argument(
+        "--result-dir",
+        type=str,
+        default=None,
+        help="Specify directory to save benchmark json results. "
+        "If not specified, results are saved in the current directory.",
+    )
+    parser.add_argument(
+        "--result-filename",
+        type=str,
+        default=None,
+        # Describes the default name that main() actually builds.
+        help="Specify the filename to save benchmark json results. "
+        "If not specified, results will be saved in "
+        "{structured_output_ratio}guided_{backend}_{request_rate}qps_"
+        "{base_model_id}_{dataset}_{num_prompts}_out{output_len}.txt"
+        " format.",
+    )
+    parser.add_argument(
+        "--ignore-eos",
+        action="store_true",
+        help="Set ignore_eos flag when sending the benchmark request. "
+        "Warning: ignore_eos is not supported in deepspeed_mii and tgi.",
+    )
+    parser.add_argument(
+        "--percentile-metrics",
+        type=str,
+        default="ttft,tpot,itl",
+        help="Comma-separated list of selected metrics to report percentiles. "
+        "This argument specifies the metrics to report percentiles. "
+        'Allowed metric names are "ttft", "tpot", "itl", "e2el". '
+        'Default value is "ttft,tpot,itl".',
+    )
+    parser.add_argument(
+        "--metric-percentiles",
+        type=str,
+        default="99",
+        help="Comma-separated list of percentiles for selected metrics. "
+        'To report 25-th, 50-th, and 75-th percentiles, use "25,50,75". '
+        'Default value is "99". '
+        'Use "--percentile-metrics" to select metrics.',
+    )
+    parser.add_argument(
+        "--goodput",
+        nargs="+",
+        required=False,
+        help='Specify service level objectives for goodput as "KEY:VALUE" '
+        "pairs, where the key is a metric name, and the value is in "
+        'milliseconds. Multiple "KEY:VALUE" pairs can be provided, '
+        "separated by spaces. Allowed request level metric names are "
+        '"ttft", "tpot", "e2el". For more context on the definition of '
+        "goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 "
+        "and the blog: https://hao-ai-lab.github.io/blogs/distserve",
+    )
+
+    parser.add_argument(
+        "--no-structured-output",
+        action="store_true",
+        default=False,
+        help="Whether to disable JSON decoding or not.",
+    )
+    parser.add_argument(
+        "--structured-output-ratio",
+        type=float,
+        default=1.0,
+        help="Ratio of Structured Outputs requests",
+    )
+
+    return parser
+
+
+if __name__ == "__main__":
+    # Script entry point: build the parser, parse the command line and run
+    # the benchmark. `parser` and `args` become module-level names here.
+    parser = create_argument_parser()
+    args = parser.parse_args()
+    main(args)
--- a/benchmark_throughput.py
+++ b/benchmark_throughput.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Benchmark offline inference throughput."""
+
+import argparse
+import dataclasses
+import json
+import os
+import random
+import time
+
+from pathlib import Path
+import warnings
+from typing import Any, Optional, Union
+
+import numpy as np
+import torch
+import uvloop
+from tqdm import tqdm
+
+from vllm.inputs import PromptType
+from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase
+
+from benchmark_dataset import (
+    AIMODataset,
+    BurstGPTDataset,
+    ConversationDataset,
+    InstructCoderDataset,
+    RandomDataset,
+    SampleRequest,
+    ShareGPTDataset,
+    SonnetDataset,
+    VisionArenaDataset,
+)
+from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
+from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
+from vllm.entrypoints.openai.api_server import (
+    build_async_engine_client_from_engine_args,
+)
+from vllm.inputs import TextPrompt, TokensPrompt
+from vllm.lora.request import LoRARequest
+from vllm.outputs import RequestOutput
+from vllm.sampling_params import BeamSearchParams
+from vllm.utils import FlexibleArgumentParser, merge_async_iterators
+
+
+def run_vllm(
+    requests: list[SampleRequest],
+    n: int,
+    num_iters_warmup: int,
+    engine_args: EngineArgs,
+    disable_detokenize: bool = False,
+) -> tuple[float, Optional[list[RequestOutput]]]:
+    """Time offline generation of ``requests`` with a vLLM ``LLM`` engine.
+
+    Builds the engine from ``engine_args``, runs ``num_iters_warmup`` warm-up
+    generations on a random 10-token prompt, then generates all requests once
+    and measures the wall-clock time of that single call.
+
+    Returns:
+        ``(elapsed_seconds, outputs)``; ``outputs`` stays ``None`` on the
+        beam-search path.
+
+    NOTE(review): the body reads ``args`` (``args.profile``,
+    ``args.profile_result_dir``, ``args.n``, ``args.output_len``), which is
+    not a parameter. Presumably it is the namespace bound at module level
+    under ``__main__`` - confirm; calling this function from another module
+    would then raise NameError.
+    """
+    from vllm import LLM, SamplingParams
+
+    llm = LLM(**dataclasses.asdict(engine_args))
+    # Every request must fit in the model's context window.
+    assert all(
+        llm.llm_engine.model_config.max_model_len
+        >= (request.prompt_len + request.expected_output_len)
+        for request in requests
+    ), (
+        "Please ensure that max_model_len is greater than the sum of"
+        " prompt_len and expected_output_len for all requests."
+    )
+    # Add the requests to the engine.
+    prompts: list[Union[TextPrompt, TokensPrompt]] = []
+    sampling_params: list[SamplingParams] = []
+    for request in requests:
+        # Pre-tokenized prompts carry "prompt_token_ids"; anything else is
+        # passed through as text.
+        prompts.append(
+            TokensPrompt(
+                prompt_token_ids=request.prompt["prompt_token_ids"],
+                multi_modal_data=request.multi_modal_data,
+            )
+            if "prompt_token_ids" in request.prompt
+            else TextPrompt(
+                prompt=request.prompt, multi_modal_data=request.multi_modal_data
+            )
+        )
+        sampling_params.append(
+            SamplingParams(
+                n=n,
+                temperature=1.0,
+                top_p=1.0,
+                ignore_eos=True,
+                max_tokens=request.expected_output_len,
+                detokenize=not disable_detokenize,
+            )
+        )
+    lora_requests: Optional[list[LoRARequest]] = None
+    if engine_args.enable_lora:
+        lora_requests = [request.lora_request for request in requests]
+
+    # Warm-up: one dummy prompt of 10 random token ids below 10000,
+    # generating at most 10 tokens per iteration.
+    warmup_sampling_params = SamplingParams(
+        n=n,
+        temperature=1.0,
+        top_p=1.0,
+        ignore_eos=True,
+        max_tokens=10,
+    )
+    dummy_prompt_token_ids = np.random.randint(10000, size=(1,10))
+    dummy_prompts: list[PromptType] = [{
+        "prompt_token_ids": batch
+    } for batch in dummy_prompt_token_ids.tolist()]
+    
+    # Hard-coded switch: with False, the beam-search branches below never run.
+    use_beam_search = False
+    
+    print("Warming up...")
+    for _ in tqdm(range(num_iters_warmup), desc="Warmup iterations"):
+        if not use_beam_search:
+            llm.generate(dummy_prompts, sampling_params=warmup_sampling_params, use_tqdm=False)
+        else:
+            # NOTE(review): uses the module-level `args`, not the `n` parameter.
+            llm.beam_search(
+                dummy_prompts,
+                BeamSearchParams(
+                    beam_width=args.n,
+                    max_tokens=args.output_len,
+                    ignore_eos=True,
+                ),
+            )
+
+    outputs = None
+    if not use_beam_search:
+        # NOTE(review): `args` is the module-level name here as well.
+        if args.profile:
+            profile_dir = args.profile_result_dir
+            if not profile_dir:
+                # Fall back to a timestamped directory under the working dir.
+                profile_dir = Path(
+                    "."
+                ) / "vllm_benchmark_result" / f"latency_result_{time.time()}"
+            print(f"Profiling (results will be saved to '{profile_dir}')...")
+            # The timed generate call runs inside the torch profiler context,
+            # so the measured time includes whatever overhead profiling adds.
+            with torch.profiler.profile(
+                        activities=[torch.profiler.ProfilerActivity.CPU,
+                                    torch.profiler.ProfilerActivity.CUDA,
+                        ],record_shapes=True,
+                        on_trace_ready=torch.profiler.tensorboard_trace_handler(str(profile_dir))
+                        ) as prof:
+                start = time.perf_counter()
+                outputs = llm.generate(
+                    prompts, sampling_params, lora_request=lora_requests, use_tqdm=True
+                )
+                end = time.perf_counter()
+            print('Prepare time report')
+            print(prof.key_averages(group_by_input_shape=True).table(sort_by="self_cuda_time_total", row_limit=-1))
+        else:
+            start = time.perf_counter()
+            outputs = llm.generate(
+                prompts, sampling_params, lora_request=lora_requests, use_tqdm=True
+            )
+            end = time.perf_counter()
+    else:
+        assert lora_requests is None, "BeamSearch API does not support LoRA"
+        prompts = [request.prompt for request in requests]
+        # output_len should be the same for all requests.
+        output_len = requests[0].expected_output_len
+        for request in requests:
+            assert request.expected_output_len == output_len
+        start = time.perf_counter()
+        llm.beam_search(
+            prompts,
+            BeamSearchParams(
+                beam_width=n,
+                max_tokens=output_len,
+                ignore_eos=True,
+            ),
+        )
+        end = time.perf_counter()
+    return end - start, outputs
+
+
+def run_vllm_chat(
+    requests: list[SampleRequest],
+    n: int,
+    engine_args: EngineArgs,
+    disable_detokenize: bool = False,
+) -> tuple[float, list[RequestOutput]]:
+    """
+    Time a batch of chat requests through ``LLM.chat``.
+
+    Recommended ONLY for multimodal models, since the chat path handles
+    multimodal inputs and chat formatting; for text-only models use
+    run_vllm() instead.
+
+    Returns ``(elapsed_seconds, outputs)`` for the single ``chat`` call.
+    """
+    from vllm import LLM, SamplingParams
+
+    llm = LLM(**dataclasses.asdict(engine_args))
+
+    # No request may need more tokens than the model's context window.
+    assert not any(
+        llm.llm_engine.model_config.max_model_len
+        < (req.prompt_len + req.expected_output_len)
+        for req in requests
+    ), (
+        "Please ensure that max_model_len is greater than the sum of "
+        "prompt_len and expected_output_len for all requests."
+    )
+
+    prompts = [req.prompt for req in requests]
+    sampling_params: list[SamplingParams] = [
+        SamplingParams(
+            n=n,
+            temperature=1.0,
+            top_p=1.0,
+            ignore_eos=True,
+            max_tokens=req.expected_output_len,
+            detokenize=not disable_detokenize,
+        )
+        for req in requests
+    ]
+
+    started_at = time.perf_counter()
+    outputs = llm.chat(prompts, sampling_params, use_tqdm=True)
+    elapsed = time.perf_counter() - started_at
+    return elapsed, outputs
+
+
+async def run_vllm_async(
+    requests: list[SampleRequest],
+    n: int,
+    engine_args: AsyncEngineArgs,
+    disable_frontend_multiprocessing: bool = False,
+    disable_detokenize: bool = False,
+) -> float:
+    """Time generation of ``requests`` through the async engine client.
+
+    All requests are submitted at once and their output streams are drained
+    together; the return value is the elapsed time in seconds from just
+    before the first submission until every stream is exhausted.
+    """
+    from vllm import SamplingParams
+
+    def _as_prompt(req) -> Union[TextPrompt, TokensPrompt]:
+        # Pre-tokenized prompts carry "prompt_token_ids"; the rest are text.
+        if "prompt_token_ids" in req.prompt:
+            return TokensPrompt(
+                prompt_token_ids=req.prompt["prompt_token_ids"],
+                multi_modal_data=req.multi_modal_data,
+            )
+        return TextPrompt(prompt=req.prompt, multi_modal_data=req.multi_modal_data)
+
+    async with build_async_engine_client_from_engine_args(
+        engine_args, disable_frontend_multiprocessing
+    ) as llm:
+        model_config = await llm.get_model_config()
+        assert not any(
+            model_config.max_model_len < (req.prompt_len + req.expected_output_len)
+            for req in requests
+        ), (
+            "Please ensure that max_model_len is greater than the sum of"
+            " prompt_len and expected_output_len for all requests."
+        )
+
+        # Build the per-request engine inputs.
+        prompts: list[Union[TextPrompt, TokensPrompt]] = []
+        sampling_params: list[SamplingParams] = []
+        lora_requests: list[Optional[LoRARequest]] = []
+        for req in requests:
+            prompts.append(_as_prompt(req))
+            sampling_params.append(
+                SamplingParams(
+                    n=n,
+                    temperature=1.0,
+                    top_p=1.0,
+                    ignore_eos=True,
+                    max_tokens=req.expected_output_len,
+                    detokenize=not disable_detokenize,
+                )
+            )
+            lora_requests.append(req.lora_request)
+
+        started_at = time.perf_counter()
+        generators = [
+            llm.generate(prompt, params, lora_request=lora, request_id=f"test{idx}")
+            for idx, (prompt, params, lora) in enumerate(
+                zip(prompts, sampling_params, lora_requests)
+            )
+        ]
+        # Drain every stream; only the elapsed time matters.
+        async for _idx, _res in merge_async_iterators(*generators):
+            pass
+        return time.perf_counter() - started_at
+
+
+def run_hf(
+    requests: list[SampleRequest],
+    model: str,
+    tokenizer: PreTrainedTokenizerBase,
+    n: int,
+    max_batch_size: int,
+    trust_remote_code: bool,
+    disable_detokenize: bool = False,
+) -> float:
+    """Time generation of ``requests`` with a Hugging Face causal LM.
+
+    Requests are greedily grouped, in order, into batches of at most
+    ``max_batch_size`` prompts. A batch is closed early when adding the next
+    request would push (longest prompt + longest output) past 2048 tokens.
+
+    Args:
+        requests: Requests to run, in order.
+        model: Model name or path passed to ``from_pretrained``.
+        tokenizer: Tokenizer used to encode prompts and decode outputs.
+        n: Number of sequences returned per prompt.
+        max_batch_size: Maximum number of prompts per ``generate`` call.
+        trust_remote_code: Forwarded to ``from_pretrained``.
+        disable_detokenize: When true, decoding is left out of the timing.
+
+    Returns:
+        Elapsed wall-clock seconds for all batches.
+    """
+    llm = AutoModelForCausalLM.from_pretrained(
+        model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code
+    )
+    if llm.config.model_type == "llama":
+        # To enable padding in the HF backend.
+        tokenizer.pad_token = tokenizer.eos_token
+    llm = llm.cuda()
+
+    # Cap on longest prompt plus longest output within one batch.
+    length_budget = 2048
+    last_index = len(requests) - 1
+
+    batch: list[str] = []
+    max_prompt_len = 0
+    max_output_len = 0
+    # The context manager closes the progress bar even if generation raises.
+    with tqdm(total=len(requests)) as pbar:
+        start = time.perf_counter()
+        for i, request in enumerate(requests):
+            # Add the prompt to the batch.
+            batch.append(request.prompt)
+            max_prompt_len = max(max_prompt_len, request.prompt_len)
+            max_output_len = max(max_output_len, request.expected_output_len)
+            if len(batch) < max_batch_size and i != last_index:
+                # Check if we can add more requests to the batch.
+                upcoming = requests[i + 1]
+                if (
+                    max(max_prompt_len, upcoming.prompt_len)
+                    + max(max_output_len, upcoming.expected_output_len)
+                ) <= length_budget:
+                    # We can add more requests to the batch.
+                    continue
+
+            # Generate the sequences.
+            input_ids = tokenizer(batch, return_tensors="pt", padding=True).input_ids
+            llm_outputs = llm.generate(
+                input_ids=input_ids.cuda(),
+                do_sample=True,
+                num_return_sequences=n,
+                temperature=1.0,
+                top_p=1.0,
+                use_cache=True,
+                max_new_tokens=max_output_len,
+            )
+            if not disable_detokenize:
+                # Include the decoding time.
+                tokenizer.batch_decode(llm_outputs, skip_special_tokens=True)
+            pbar.update(len(batch))
+
+            # Clear the batch.
+            batch = []
+            max_prompt_len = 0
+            max_output_len = 0
+        end = time.perf_counter()
+    return end - start
+
+
+def run_mii(
+    requests: list[SampleRequest],
+    model: str,
+    tensor_parallel_size: int,
+    output_len: int,
+) -> float:
+    from mii import client, serve
+
+    llm = serve(model, tensor_parallel=tensor_parallel_size)
+    prompts = [request.prompt for request in requests]
+
+    start = time.perf_counter()
+    llm.generate(prompts, max_new_tokens=output_len)
+    end = time.perf_counter()
+    client = client(model)
+    client.terminate_server()
+    return end - start
+
+
+def save_to_pytorch_benchmark_format(
+    args: argparse.Namespace, results: dict[str, Any]
+) -> None:
+    pt_records = convert_to_pytorch_benchmark_format(
+        args=args,
+        metrics={
+            "requests_per_second": [results["requests_per_second"]],
+            "tokens_per_second": [results["tokens_per_second"]],
+        },
+        extra_info={
+            k: results[k] for k in ["elapsed_time", "num_requests", "total_num_tokens"]
+        },
+    )
+    if pt_records:
+        # Don't use json suffix here as we don't want CI to pick it up
+        pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
+        write_to_json(pt_file, pt_records)
+
+
+def get_requests(args, tokenizer):
+    # Common parameters for all dataset types.
+    common_kwargs = {
+        "dataset_path": args.dataset_path,
+        "random_seed": args.seed,
+    }
+    sample_kwargs = {
+        "tokenizer": tokenizer,
+        "lora_path": args.lora_path,
+        "max_loras": args.max_loras,
+        "num_requests": args.num_prompts,
+        "input_len": args.input_len,
+        "output_len": args.output_len,
+    }
+
+    if args.dataset_path is None or args.dataset_name == "random":
+        sample_kwargs["range_ratio"] = args.random_range_ratio
+        sample_kwargs["prefix_len"] = args.prefix_len
+        dataset_cls = RandomDataset
+    elif args.dataset_name == "sharegpt":
+        dataset_cls = ShareGPTDataset
+        if args.backend == "vllm-chat":
+            sample_kwargs["enable_multimodal_chat"] = True
+    elif args.dataset_name == "sonnet":
+        assert tokenizer.chat_template or tokenizer.default_chat_template, (
+            "Tokenizer/model must have chat template for sonnet dataset."
+        )
+        dataset_cls = SonnetDataset
+        sample_kwargs["prefix_len"] = args.prefix_len
+        sample_kwargs["return_prompt_formatted"] = True
+    elif args.dataset_name == "burstgpt":
+        dataset_cls = BurstGPTDataset
+    elif args.dataset_name == "hf":
+        if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
+            dataset_cls = VisionArenaDataset
+            common_kwargs["dataset_subset"] = None
+            common_kwargs["dataset_split"] = "train"
+            sample_kwargs["enable_multimodal_chat"] = True
+        elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
+            dataset_cls = InstructCoderDataset
+            common_kwargs["dataset_split"] = "train"
+        elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
+            dataset_cls = ConversationDataset
+            common_kwargs["dataset_subset"] = args.hf_subset
+            common_kwargs["dataset_split"] = args.hf_split
+            sample_kwargs["enable_multimodal_chat"] = True
+        elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS:
+            dataset_cls = AIMODataset
+            common_kwargs["dataset_subset"] = None
+            common_kwargs["dataset_split"] = "train"
+    else:
+        raise ValueError(f"Unknown dataset name: {args.dataset_name}")
+    # Remove None values
+    sample_kwargs = {k: v for k, v in sample_kwargs.items() if v is not None}
+    return dataset_cls(**common_kwargs).sample(**sample_kwargs)
+
+
+def main(args: argparse.Namespace):
+    if args.seed is None:
+        args.seed = 0
+    print(args)
+    random.seed(args.seed)
+    # Sample the requests.
+    tokenizer = AutoTokenizer.from_pretrained(
+        args.tokenizer, trust_remote_code=args.trust_remote_code
+    )
+    requests = get_requests(args, tokenizer)
+    is_multi_modal = any(request.multi_modal_data is not None for request in requests)
+    request_outputs: Optional[list[RequestOutput]] = None
+    if args.backend == "vllm":
+        if args.async_engine:
+            elapsed_time = uvloop.run(
+                run_vllm_async(
+                    requests,
+                    args.n,
+                    AsyncEngineArgs.from_cli_args(args),
+                    args.disable_frontend_multiprocessing,
+                    args.disable_detokenize,
+                )
+            )
+        else:
+            elapsed_time, request_outputs = run_vllm(
+                requests,
+                args.n,
+                args.num_iters_warmup,
+                EngineArgs.from_cli_args(args),
+                args.disable_detokenize,
+            )
+    elif args.backend == "hf":
+        assert args.tensor_parallel_size == 1
+        elapsed_time = run_hf(
+            requests,
+            args.model,
+            tokenizer,
+            args.n,
+            args.hf_max_batch_size,
+            args.trust_remote_code,
+            args.disable_detokenize,
+        )
+    elif args.backend == "mii":
+        elapsed_time = run_mii(
+            requests, args.model, args.tensor_parallel_size, args.output_len
+        )
+    elif args.backend == "vllm-chat":
+        elapsed_time, request_outputs = run_vllm_chat(
+            requests, args.n, EngineArgs.from_cli_args(args), args.disable_detokenize
+        )
+    else:
+        raise ValueError(f"Unknown backend: {args.backend}")
+
+    if request_outputs:
+        # Note: with the vllm and vllm-chat backends,
+        # we have request_outputs, which we use to count tokens.
+        total_prompt_tokens = 0
+        total_output_tokens = 0
+        for ro in request_outputs:
+            if not isinstance(ro, RequestOutput):
+                continue
+            total_prompt_tokens += (
+                len(ro.prompt_token_ids) if ro.prompt_token_ids else 0
+            )
+            total_output_tokens += sum(len(o.token_ids) for o in ro.outputs if o)
+        total_num_tokens = total_prompt_tokens + total_output_tokens
+    else:
+        total_num_tokens = sum(r.prompt_len + r.expected_output_len for r in requests)
+        total_output_tokens = sum(r.expected_output_len for r in requests)
+        total_prompt_tokens = total_num_tokens - total_output_tokens
+
+    if is_multi_modal and args.backend != "vllm-chat":
+        print(
+            "\033[91mWARNING\033[0m: Multi-modal request with "
+            f"{args.backend} backend detected. The "
+            "following metrics are not accurate because image tokens are not"
+            " counted. See vllm-project/vllm/issues/9778 for details."
+        )
+        # TODO(vllm-project/vllm/issues/9778): Count multi-modal token length.
+        # vllm-chat backend counts the image tokens now
+
+    print(f"Latency: {elapsed_time:.2f} s")
+    print(
+        f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
+        f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
+        f"{total_output_tokens / elapsed_time:.2f} output tokens/s"
+    )
+    print(f"Total num prompt tokens:  {total_prompt_tokens}")
+    print(f"Total num output tokens:  {total_output_tokens}")
+
+    # Output JSON results if specified
+    if args.output_json:
+        results = {
+            "elapsed_time": elapsed_time,
+            "num_requests": len(requests),
+            "total_num_tokens": total_num_tokens,
+            "requests_per_second": len(requests) / elapsed_time,
+            "tokens_per_second": total_num_tokens / elapsed_time,
+        }
+        with open(args.output_json, "w") as f:
+            json.dump(results, f, indent=4)
+        save_to_pytorch_benchmark_format(args, results)
+
+
+def validate_args(args):
+    """
+    Validate command-line arguments.
+    """
+
+    # === Deprecation and Defaulting ===
+    if args.dataset is not None:
+        warnings.warn(
+            "The '--dataset' argument will be deprecated in the next release. "
+            "Please use '--dataset-name' and '--dataset-path' instead.",
+            stacklevel=2,
+        )
+        args.dataset_path = args.dataset
+
+    if not getattr(args, "tokenizer", None):
+        args.tokenizer = args.model
+
+    # === Backend Validation ===
+    valid_backends = {"vllm", "hf", "mii", "vllm-chat"}
+    if args.backend not in valid_backends:
+        raise ValueError(f"Unsupported backend: {args.backend}")
+
+    # === Dataset Configuration ===
+    if not args.dataset and not args.dataset_path:
+        print("When dataset path is not set, it will default to random dataset")
+        args.dataset_name = "random"
+        if args.input_len is None:
+            raise ValueError("input_len must be provided for a random dataset")
+
+    # === Dataset Name Specific Checks ===
+    # --hf-subset and --hf-split: only used
+    # when dataset_name is 'hf'
+    if args.dataset_name != "hf" and (
+        getattr(args, "hf_subset", None) is not None
+        or getattr(args, "hf_split", None) is not None
+    ):
+        warnings.warn(
+            "--hf-subset and --hf-split will be ignored \
+                since --dataset-name is not 'hf'.",
+            stacklevel=2,
+        )
+    elif args.dataset_name == "hf":
+        if args.dataset_path in (
+            VisionArenaDataset.SUPPORTED_DATASET_PATHS.keys()
+            | ConversationDataset.SUPPORTED_DATASET_PATHS
+        ):
+            assert args.backend == "vllm-chat", (
+                f"{args.dataset_path} needs to use vllm-chat as the backend."
+            )  # noqa: E501
+        elif args.dataset_path in (
+            InstructCoderDataset.SUPPORTED_DATASET_PATHS
+            | AIMODataset.SUPPORTED_DATASET_PATHS
+        ):
+            assert args.backend == "vllm", (
+                f"{args.dataset_path} needs to use vllm as the backend."
+            )  # noqa: E501
+        else:
+            raise ValueError(f"{args.dataset_path} is not supported by hf dataset.")
+
+    # --random-range-ratio: only used when dataset_name is 'random'
+    if args.dataset_name != "random" and args.random_range_ratio is not None:
+        warnings.warn(
+            "--random-range-ratio will be ignored since \
+                --dataset-name is not 'random'.",
+            stacklevel=2,
+        )
+
+    # --prefix-len: only used when dataset_name is 'random', 'sonnet', or not
+    # set.
+    if (
+        args.dataset_name not in {"random", "sonnet", None}
+        and args.prefix_len is not None
+    ):
+        warnings.warn(
+            "--prefix-len will be ignored since --dataset-name\
+                 is not 'random', 'sonnet', or not set.",
+            stacklevel=2,
+        )
+
+    # === LoRA Settings ===
+    if getattr(args, "enable_lora", False) and args.backend != "vllm":
+        raise ValueError("LoRA benchmarking is only supported for vLLM backend")
+    if getattr(args, "enable_lora", False) and args.lora_path is None:
+        raise ValueError("LoRA path must be provided when enable_lora is True")
+
+    # === Backend-specific Validations ===
+    if args.backend == "hf" and args.hf_max_batch_size is None:
+        raise ValueError("HF max batch size is required for HF backend")
+    if args.backend != "hf" and args.hf_max_batch_size is not None:
+        raise ValueError("HF max batch size is only for HF backend.")
+
+    if (
+        args.backend in {"hf", "mii"}
+        and getattr(args, "quantization", None) is not None
+    ):
+        raise ValueError("Quantization is only for vLLM backend.")
+
+    if args.backend == "mii" and args.dtype != "auto":
+        raise ValueError("dtype must be auto for MII backend.")
+    if args.backend == "mii" and args.n != 1:
+        raise ValueError("n must be 1 for MII backend.")
+    if args.backend == "mii" and args.tokenizer != args.model:
+        raise ValueError("Tokenizer must be the same as the model for MII backend.")
+
+    # --data-parallel is not supported currently.
+    # https://github.com/vllm-project/vllm/issues/16222
+    if args.data_parallel_size > 1:
+        raise ValueError(
+            "Data parallel is not supported in offline benchmark, \
+            please use benchmark serving instead"
+        )
+
+
+def create_argument_parser():
+    parser = FlexibleArgumentParser(description="Benchmark the throughput.")
+    parser.add_argument(
+        "--backend",
+        type=str,
+        choices=["vllm", "hf", "mii", "vllm-chat"],
+        default="vllm",
+    )
+    parser.add_argument(
+        "--dataset-name",
+        type=str,
+        choices=["sharegpt", "random", "sonnet", "burstgpt", "hf"],
+        help="Name of the dataset to benchmark on.",
+        default="sharegpt",
+    )
+    parser.add_argument(
+        "--dataset",
+        type=str,
+        default=None,
+        help="Path to the ShareGPT dataset, will be deprecated in\
+            the next release. The dataset is expected to "
+        "be a json in form of list[dict[..., conversations: "
+        "list[dict[..., value: <prompt_or_response>]]]]",
+    )
+    parser.add_argument(
+        "--dataset-path", type=str, default=None, help="Path to the dataset"
+    )
+    parser.add_argument(
+        "--input-len",
+        type=int,
+        default=None,
+        help="Input prompt length for each request",
+    )
+    parser.add_argument(
+        "--output-len",
+        type=int,
+        default=None,
+        help="Output length for each request. Overrides the "
+        "output length from the dataset.",
+    )
+    parser.add_argument(
+        "--n", type=int, default=1, help="Number of generated sequences per prompt."
+    )
+    parser.add_argument(
+        "--num-iters-warmup", type=int, default=1, help="Number of iterations to run for warmup."
+    )
+    parser.add_argument(
+        "--num-prompts", type=int, default=1000, help="Number of prompts to process."
+    )
+    parser.add_argument(
+        '--profile',
+        action='store_true',
+        help='profile the generation process of a single batch')
+    parser.add_argument(
+        '--profile-result-dir',
+        type=str,
+        default=None,
+        help=('path to save the pytorch profiler output. Can be visualized '
+              'with ui.perfetto.dev or Tensorboard.'))
+    parser.add_argument(
+        "--hf-max-batch-size",
+        type=int,
+        default=None,
+        help="Maximum batch size for HF backend.",
+    )
+    parser.add_argument(
+        "--output-json",
+        type=str,
+        default=None,
+        help="Path to save the throughput results in JSON format.",
+    )
+    parser.add_argument(
+        "--async-engine",
+        action="store_true",
+        default=False,
+        help="Use vLLM async engine rather than LLM class.",
+    )
+    parser.add_argument(
+        "--disable-frontend-multiprocessing",
+        action="store_true",
+        default=False,
+        help="Disable decoupled async engine frontend.",
+    )
+    parser.add_argument(
+        "--disable-detokenize",
+        action="store_true",
+        help=(
+            "Do not detokenize the response (i.e. do not include "
+            "detokenization time in the measurement)"
+        ),
+    )
+    # LoRA
+    parser.add_argument(
+        "--lora-path",
+        type=str,
+        default=None,
+        help="Path to the LoRA adapters to use. This can be an absolute path, "
+        "a relative path, or a Hugging Face model identifier.",
+    )
+    parser.add_argument(
+        "--prefix-len",
+        type=int,
+        default=None,
+        help=f"Number of prefix tokens to be used in RandomDataset "
+        "and SonnetDataset. For RandomDataset, the total input "
+        "length is the sum of prefix-len (default: "
+        f"{RandomDataset.DEFAULT_PREFIX_LEN}) and a random context length "
+        "sampled from [input_len * (1 - range_ratio), "
+        "input_len * (1 + range_ratio)]. For SonnetDataset, "
+        f"prefix_len (default: {SonnetDataset.DEFAULT_PREFIX_LEN}) "
+        "controls how much of the input is fixed lines versus "
+        "random lines, but the total input length remains approximately "
+        "input_len tokens.",
+    )
+    # random dataset
+    parser.add_argument(
+        "--random-range-ratio",
+        type=float,
+        default=None,
+        help=f"Range ratio (default : {RandomDataset.DEFAULT_RANGE_RATIO}) "
+        "for sampling input/output length, "
+        "used only for RandomDataset. Must be in the range [0, 1) to "
+        "define a symmetric sampling range "
+        "[length * (1 - range_ratio), length * (1 + range_ratio)].",
+    )
+
+    # hf dtaset
+    parser.add_argument(
+        "--hf-subset", type=str, default=None, help="Subset of the HF dataset."
+    )
+    parser.add_argument(
+        "--hf-split", type=str, default=None, help="Split of the HF dataset."
+    )
+
+    parser = AsyncEngineArgs.add_cli_args(parser)
+
+    return parser
+
+
+if __name__ == "__main__":
+    parser = create_argument_parser()
+    args = parser.parse_args()
+    if args.tokenizer is None:
+        args.tokenizer = args.model
+    validate_args(args)
+    main(args)
\ No newline at end of file
--- a/benchmark_utils.py
+++ b/benchmark_utils.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import argparse
+import json
+import math
+import os
+from typing import Any
+
+
+def convert_to_pytorch_benchmark_format(
+    args: argparse.Namespace, metrics: dict[str, list], extra_info: dict[str, Any]
+) -> list:
+    """
+    Save the benchmark results in the format used by PyTorch OSS benchmark with
+    on metric per record
+    https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
+    """
+    records = []
+    if not os.environ.get("SAVE_TO_PYTORCH_BENCHMARK_FORMAT", False):
+        return records
+
+    for name, benchmark_values in metrics.items():
+        record = {
+            "benchmark": {
+                "name": "vLLM benchmark",
+                "extra_info": {
+                    "args": vars(args),
+                },
+            },
+            "model": {
+                "name": args.model,
+            },
+            "metric": {
+                "name": name,
+                "benchmark_values": benchmark_values,
+                "extra_info": extra_info,
+            },
+        }
+
+        tp = record["benchmark"]["extra_info"]["args"].get("tensor_parallel_size")
+        # Save tensor_parallel_size parameter if it's part of the metadata
+        if not tp and "tensor_parallel_size" in extra_info:
+            record["benchmark"]["extra_info"]["args"]["tensor_parallel_size"] = (
+                extra_info["tensor_parallel_size"]
+            )
+
+        records.append(record)
+
+    return records
+
+
+class InfEncoder(json.JSONEncoder):
+    def clear_inf(self, o: Any):
+        if isinstance(o, dict):
+            return {k: self.clear_inf(v) for k, v in o.items()}
+        elif isinstance(o, list):
+            return [self.clear_inf(v) for v in o]
+        elif isinstance(o, float) and math.isinf(o):
+            return "inf"
+        return o
+
+    def iterencode(self, o: Any, *args, **kwargs) -> Any:
+        return super().iterencode(self.clear_inf(o), *args, **kwargs)
+
+
+def write_to_json(filename: str, records: list) -> None:
+    with open(filename, "w") as f:
+        json.dump(
+            records,
+            f,
+            cls=InfEncoder,
+            default=lambda o: f"<{type(o).__name__} object is not JSON serializable>",
+        )
--- a/client.sh
+++ b/client.sh
+
+echo "tp,data_type,batch,prompt_tokens,completion_tokens,TOTAL_THROUGHPUT(toks/s),generate_throughput(toks/s), TTFT(ms),TPOT(ms),ITL(ms),P99_TTFT(ms),P99_TPOT(ms),P99_ITL(ms)" >  DeepSeek-R1-0528-W4A8-V2_tp8-nopc.csv
+pairs=( "512 512"  "1024 512" "2048 512" "4096 512" "8192 512" "16384 512")
+model_path="/module3/DeepSeek-R1-0528-W4A8-V2"
+tp=8
+data_type="W4A8"
+mkdir -p ./log/
+for pair in "${pairs[@]}"; do
+    for batch in 1 2 4 8 16 32 64 128  ; do
+        prompt_tokens=${pair%% *}
+        completion_tokens=${pair#* }
+        echo "data_type: $data_type,batch: $batch, prompt_tokens: $prompt_tokens, completion_tokens: $completion_tokens, tp: ${tp}"
+        log_path="log/vllm_${model}_batch_${batch}_prompt_tokens_${prompt_tokens}_completion_tokens_${completion_tokens}_tp_${tp}.log"
+        touch $log_path
+        # benchmark_throughput.py
+        python benchmark_serving.py \
+                --backend openai \
+                --port 20011\
+                --model ${model_path} \
+                --trust-remote-code \
+                --dataset-name random \
+                --ignore-eos \
+                --random-input-len ${prompt_tokens} \
+                --random-output-len ${completion_tokens} \
+                --num-prompts ${batch}  \
+                2>&1 | tee  $log_path
+        #metric
+        E2E_TIME=`grep "^Benchmark duration" $log_path | awk -F ' ' '{print $4}'`
+        REQ_THROUGHPUT=`grep "^Request throughput"  $log_path| awk -F ' ' '{print $4}'`
+        GEN_THROUGHPUT=`grep "^Output token"  $log_path| awk -F ' ' '{print $5}'`
+        TOTAL_THROUGHPUT=`grep "^Total Token" $log_path| awk -F ' ' '{print $5}'`
+        TTFT=`grep "^Mean TTFT"  $log_path| awk -F ' ' '{print $4}'`
+        TPOT=`grep "^Mean TPOT"  $log_path| awk -F ' ' '{print $4}'`
+        ITL=`grep "^Mean ITL"  $log_path| awk -F ' ' '{print $4}'`
+        P99_ITL=`grep "^P99 ITL"  $log_path| awk -F ' ' '{print $4}'`
+        P99_TTFT=`grep "^P99 TTFT"  $log_path| awk -F ' ' '{print $4}'`
+        P99_TPOT=`grep "^P99 TPOT"  $log_path| awk -F ' ' '{print $4}'`
+        echo "$tp,$data_type,$batch,$prompt_tokens,$completion_tokens,$TOTAL_THROUGHPUT,$GEN_THROUGHPUT,$TTFT,$TPOT, $ITL,$P99_TTFT,$P99_TPOT,$P99_ITL" >>  DeepSeek-R1-0528-W4A8-V2_tp8-nopc.csv
+    done
+done
\ No newline at end of file
--- a/disagg_proxy_p2p_nccl_xpyd.py
+++ b/disagg_proxy_p2p_nccl_xpyd.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import os
+import socket
+import threading
+import uuid
+
+import aiohttp
+import msgpack
+import zmq
+from quart import Quart, make_response, request
+
+# Number of requests handled so far. handle_request takes it modulo the number
+# of registered instances to pick a prefill and a decode instance.
+count = 0
+# Registries filled by _listen_for_register from incoming registration
+# messages of type "P" (prefill) and "D" (decode).
+prefill_instances: dict[str, str] = {}  # http_address: zmq_address
+decode_instances: dict[str, str] = {}  # http_address: zmq_address
+
+# Held (as plain locks) while the matching registry is written or snapshotted.
+prefill_cv = threading.Condition()
+decode_cv = threading.Condition()
+
+
+def _listen_for_register(poller, router_socket):
+    while True:
+        socks = dict(poller.poll())
+        if router_socket in socks:
+            remote_address, message = router_socket.recv_multipart()
+            # data: {"type": "P", "http_address": "ip:port",
+            #        "zmq_address": "ip:port"}
+            data = msgpack.loads(message)
+            if data["type"] == "P":
+                global prefill_instances
+                global prefill_cv
+                with prefill_cv:
+                    prefill_instances[data["http_address"]] = data["zmq_address"]
+            elif data["type"] == "D":
+                global decode_instances
+                global decode_cv
+                with decode_cv:
+                    decode_instances[data["http_address"]] = data["zmq_address"]
+            else:
+                print(
+                    "Unexpected, Received message from %s, data: %s",
+                    remote_address,
+                    data,
+                )
+
+
+def start_service_discovery(hostname, port):
+    if not hostname:
+        hostname = socket.gethostname()
+    if port == 0:
+        raise ValueError("Port cannot be 0")
+
+    context = zmq.Context()
+    router_socket = context.socket(zmq.ROUTER)
+    router_socket.bind(f"tcp://{hostname}:{port}")
+
+    poller = zmq.Poller()
+    poller.register(router_socket, zmq.POLLIN)
+
+    _listener_thread = threading.Thread(
+        target=_listen_for_register, args=[poller, router_socket], daemon=True
+    )
+    _listener_thread.start()
+    return _listener_thread
+
+
+# Total timeout (6 * 60 * 60, i.e. six hours if the unit is seconds) used for
+# every client session that forward_request opens.
+AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
+
+# Quart application that serves the proxy's /v1/completions route.
+app = Quart(__name__)
+
+
+def random_uuid() -> str:
+    return str(uuid.uuid4().hex)
+
+
+async def forward_request(url, data, request_id):
+    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+        headers = {
+            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
+            "X-Request-Id": request_id,
+        }
+        async with session.post(url=url, json=data, headers=headers) as response:
+            if response.status == 200:
+                if True:
+                    async for chunk_bytes in response.content.iter_chunked(1024):
+                        yield chunk_bytes
+                else:
+                    content = await response.read()
+                    yield content
+
+
+@app.route("/v1/completions", methods=["POST"])
+async def handle_request():
+    try:
+        original_request_data = await request.get_json()
+
+        prefill_request = original_request_data.copy()
+        # change max_tokens = 1 to let it only do prefill
+        prefill_request["max_tokens"] = 1
+
+        global count
+        global prefill_instances
+        global prefill_cv
+        with prefill_cv:
+            prefill_list = list(prefill_instances.items())
+            prefill_addr, prefill_zmq_addr = prefill_list[count % len(prefill_list)]
+
+        global decode_instances
+        global decode_cv
+        with decode_cv:
+            decode_list = list(decode_instances.items())
+            decode_addr, decode_zmq_addr = decode_list[count % len(decode_list)]
+
+        print(
+            f"handle_request count: {count}, [HTTP:{prefill_addr}, "
+            f"ZMQ:{prefill_zmq_addr}] 👉 [HTTP:{decode_addr}, "
+            f"ZMQ:{decode_zmq_addr}]"
+        )
+        count += 1
+
+        request_id = (
+            f"___prefill_addr_{prefill_zmq_addr}___decode_addr_"
+            f"{decode_zmq_addr}_{random_uuid()}"
+        )
+
+        # finish prefill
+        async for _ in forward_request(
+            f"http://{prefill_addr}/v1/completions", prefill_request, request_id
+        ):
+            continue
+
+        # return decode
+        generator = forward_request(
+            f"http://{decode_addr}/v1/completions", original_request_data, request_id
+        )
+        response = await make_response(generator)
+        response.timeout = None
+
+        return response
+
+    except Exception as e:
+        import sys
+        import traceback
+
+        exc_info = sys.exc_info()
+        print("Error occurred in disagg prefill proxy server")
+        print(e)
+        print("".join(traceback.format_exception(*exc_info)))
+
+
+if __name__ == "__main__":
+    t = start_service_discovery("0.0.0.0", 30007)
+    app.run(host="0.0.0.0", port=10007)
+    t.join()