Commit 1712ebff authored by jerrrrry

Initial commit

export ALLREDUCE_STREAM_WITH_COMPUTE=1 # run allreduce on the same stream as compute
export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') # host IP address
export VLLM_TORCH_PROFILER_DIR=/workspace # torch profiler output dir
export DEBUG_CLR_GRAPH_PACKET_CAPTURE=false
export VLLM_FUSED_MOE_CHUNK_SIZE=8192
export VLLM_USE_GLOBAL_CACHE13=1 # no impact on performance; helps avoid OOM
export VLLM_ENABLE_TBO=0 # default
export VLLM_ZERO_OVERHEAD=1
current_time=$(date +"%Y%m%d-%H%M")
#export VLLM_P2P_ASYNC=1
vllm serve /module3/DeepSeek-R1-0528-W4A8-V2 \
--host 0.0.0.0 \
--port 20009 \
--trust-remote-code \
--dtype bfloat16 \
-q slimquant_w4a8_marlin \
--kv-cache-dtype fp8_e5m2 \
--max-model-len 49152 \
-tp 8 \
--gpu-memory-utilization 0.95 \
--max-num-seqs 256 \
--block-size 64 \
--disable-log-requests \
--max-num-batched-tokens 8192 \
--no-enable-prefix-caching \
--enable-chunked-prefill \
--speculative_config '{"method": "deepseek_mtp", "num_speculative_tokens": 3}' \
--kv-transfer-config '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"1e9","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"10.16.1.115","proxy_port":"30007","http_port":"20009","send_type":"PUT_ASYNC","mem_pool_size_gb":256}}' \
2>&1 | tee 1d_log/2d-${current_time}.log
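Once the decode instance above is up, a quick smoke test against its OpenAI-compatible completions endpoint confirms it is serving. The sketch below is a minimal example and not part of the launch script: it assumes the request is issued from the same host (so localhost:20009 is reachable), uses the standard /v1/completions route, and the prompt and token budget are placeholder values.

# smoke_test_decode.py -- hypothetical helper; assumes the decode instance listens on port 20009
import json
import urllib.request

payload = {
    "model": "/module3/DeepSeek-R1-0528-W4A8-V2",
    "prompt": "Hello, world",
    "max_tokens": 16,
}
req = urllib.request.Request(
    "http://localhost:20009/v1/completions",
    data=json.dumps(payload).encode(),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req, timeout=60) as resp:
    print(json.loads(resp.read())["choices"][0]["text"])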
export ALLREDUCE_STREAM_WITH_COMPUTE=1 # run allreduce on the same stream as compute
export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') # host IP address
#export VLLM_TORCH_PROFILER_DIR=/workspace # torch profiler output dir
export VLLM_USE_LIGHTOP_MOE_SUM_MUL_ADD=1
export USE_FUSED_RMS_QUANT=1 # enable fused rmsnorm+quant; other optimizations are enabled by default
export DEBUG_CLR_GRAPH_PACKET_CAPTURE=false
export VLLM_SPEC_DECODE_EAGER=1
export VLLM_USE_GLOBAL_CACHE13=1
export VLLM_FUSED_MOE_CHUNK_SIZE=8192
export SENDRECV_STREAM_WITH_COMPUTE=1
export VLLM_ENABLE_TBO=0
export VLLM_REJECT_SAMPLE_OPT=1
export VLLM_USE_LIGHTOP_RMS_ROPE_CONCAT=1
export VLLM_ZERO_OVERHEAD=1
export VLLM_NUMA_BIND=1
export VLLM_RANK0_NUMA=0
export VLLM_RANK1_NUMA=1
export VLLM_RANK2_NUMA=2
export VLLM_RANK3_NUMA=3
export VLLM_RANK4_NUMA=4
export VLLM_RANK5_NUMA=5
export VLLM_RANK6_NUMA=6
export VLLM_RANK7_NUMA=7
current_time=$(date +"%Y%m%d-%H%M")
#-pp 2 -tp 4 \
#--enable-expert-parallel
vllm serve /module3/DeepSeek-R1-0528-W4A8-V2 \
--port 20011 \
--trust-remote-code \
--dtype bfloat16 \
-q slimquant_w4a8_marlin \
--kv-cache-dtype fp8_e5m2 \
--max-model-len 49152 \
--max-num-batched-tokens 8192 \
-tp 8 \
--gpu-memory-utilization 0.93 \
--max-num-seqs 512 \
--disable-log-requests \
--block-size 64 \
--enforce-eager \
--no-enable-prefix-caching \
--enable-chunked-prefill \
--speculative_config '{"method": "deepseek_mtp", "num_speculative_tokens": 3}' \
2>&1 | tee 1p_log/1p-${current_time}.log
# To run this instance as the 1P1D prefill producer, add the option below before the log redirection
# (leaving a commented line inside the backslash continuation would break the pipe to tee):
#--kv-transfer-config '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"21001","kv_connector_extra_config":{"proxy_ip":"10.16.1.115","proxy_port":"30007","http_port":"20011","send_type":"PUT_ASYNC"}}'
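Before launching the benchmark sweep against this instance (port 20011), it helps to wait until the engine has finished loading weights. A minimal readiness poll is sketched below; it assumes vLLM's OpenAI-compatible server exposes its usual GET /health endpoint and that the check runs on the serving host.

# wait_for_server.py -- hypothetical readiness check for the instance on port 20011
import time
import urllib.error
import urllib.request

def wait_for_server(url: str = "http://localhost:20011/health", timeout_s: int = 1800) -> None:
    # Poll /health until it returns HTTP 200 or the timeout expires.
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        try:
            with urllib.request.urlopen(url, timeout=5) as resp:
                if resp.status == 200:
                    print("server is ready")
                    return
        except (urllib.error.URLError, ConnectionError):
            pass
        time.sleep(5)
    raise TimeoutError(f"server at {url} did not become ready within {timeout_s}s")

if __name__ == "__main__":
    wait_for_server()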
# SPDX-License-Identifier: Apache-2.0
"""Benchmark guided decoding throughput."""
import argparse
import dataclasses
import json
import os
import random
import time
from typing import List, Optional
import datasets
import pandas as pd
import uvloop
from transformers import AutoTokenizer, PreTrainedTokenizerBase
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.entrypoints.openai.api_server import (
build_async_engine_client_from_engine_args)
from vllm.sampling_params import GuidedDecodingParams
from vllm.utils import FlexibleArgumentParser, merge_async_iterators
@dataclasses.dataclass
class SampleRequest:
"""A class representing a single inference request for benchmarking.
Attributes:
prompt: The input text prompt for the model.
multi_modal_data: Optional dictionary containing multi-modal data (e.g.
images).
prompt_len: The length of the prompt in tokens.
expected_output_len: The expected length of the output in tokens.
"""
prompt: str
prompt_len: int
expected_output_len: int
schema: dict
structure_type: str = 'json'
completion: Optional[str] = None
def run_vllm(requests: List[SampleRequest],
engine_args: EngineArgs,
n: int,
guided_decoding_rate: float = 1.0,
warmup: bool = False) -> tuple[float, list]:
from vllm import LLM, SamplingParams
llm = LLM(**vars(engine_args))
# Add the requests to the engine.
prompts: List[str] = []
sampling_params: List[SamplingParams] = []
# create a list containing random selected true or false
guided_decoding_req_idx = random.sample(
range(len(requests)), int(len(requests) * guided_decoding_rate))
if warmup:
print(">>>>> Running warmup prompt, for the first 5")
# We setup the first 5 requests to warmup FSM
# if using xgrammar dataset, we will skip warmup
warmup_requests = requests[:5]
for i, request in enumerate(warmup_requests):
prompts.append(request.prompt)
sampling_params.append(
SamplingParams(
n=n,
temperature=1.0,
top_p=1.0,
ignore_eos=True,
max_tokens=request.expected_output_len,
guided_decoding=GuidedDecodingParams(json=request.schema)
if guided_decoding_rate > 0 else None,
))
llm.generate(prompts, sampling_params, use_tqdm=False)
print(">>>>> Benchmark started...")
prompts = []
sampling_params = []
for i, request in enumerate(requests):
prompts.append(request.prompt)
sampling_params.append(
SamplingParams(
n=n,
temperature=1.0,
top_p=1.0,
ignore_eos=True,
max_tokens=request.expected_output_len,
guided_decoding=GuidedDecodingParams(
**{request.structure_type: request.schema})
if i in guided_decoding_req_idx else None,
))
start = time.perf_counter()
outputs = llm.generate(prompts, sampling_params, use_tqdm=False)
ret = []
for output, request in zip(outputs, requests):
generated_text = output.outputs[0].text
ret.append({
"generated": generated_text,
"expected": request.completion
})
end = time.perf_counter()
return end - start, ret
async def run_vllm_async(
requests: List[SampleRequest],
engine_args: AsyncEngineArgs,
n: int,
guided_decoding_rate: float = 1.0,
warmup: bool = False,
disable_frontend_multiprocessing: bool = False) -> tuple[float, list, tuple]:
from vllm import SamplingParams
async with build_async_engine_client_from_engine_args(
engine_args, disable_frontend_multiprocessing) as llm:
# Add the requests to the engine.
prompts: List[str] = []
sampling_params: List[SamplingParams] = []
guided_decoding_req_idx = random.sample(
range(len(requests)), int(len(requests) * guided_decoding_rate))
if warmup:
print(">>>>>> Running warmup prompt, for the first 5")
# We setup the first 5 requests to warmup FSM
# if using xgrammar dataset, we will skip warmup
warmup_requests = requests[:5]
for i, request in enumerate(warmup_requests):
prompts.append(request.prompt)
sampling_params.append(
SamplingParams(
n=n,
temperature=1.0,
top_p=1.0,
ignore_eos=True,
max_tokens=request.expected_output_len,
guided_decoding=GuidedDecodingParams(
json=request.schema)
if guided_decoding_rate > 0 else None,
))
generators = []
for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)):
generator = llm.generate(prompt, sp, request_id=f"test{i}")
generators.append(generator)
all_gens = merge_async_iterators(*generators)
async for i, res in all_gens:
pass
print(">>>>> Benchmark started...")
prompts = []
sampling_params = []
for i, request in enumerate(requests):
prompts.append(request.prompt)
sampling_params.append(
SamplingParams(
n=n,
temperature=1.0,
top_p=1.0,
ignore_eos=True,
max_tokens=request.expected_output_len,
guided_decoding=GuidedDecodingParams(json=request.schema)
if i in guided_decoding_req_idx else None,
))
generators = []
start_time = []
latencies = []
start = time.perf_counter()
for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)):
generator = llm.generate(prompt, sp, request_id=f"test{i}")
generators.append(generator)
start_time.append(time.perf_counter())
latencies.append([])
all_gens = merge_async_iterators(*generators)
generated_texts = [''] * len(requests)
async for i, res in all_gens:
generated_texts[i] = res.outputs[0].text
lat = time.perf_counter() - start_time[i]
latencies[i].append(lat)
ret = [{
'generated': gt,
'expected': req.completion
} for gt, req in zip(generated_texts, requests)]
end = time.perf_counter()
first_latency = pd.Series([lat[0] * 1000 for lat in latencies])
next_latency = pd.Series([(lat[-1] - lat[0]) / len(lat[1:]) * 1000
for lat in latencies])
return end - start, ret, (first_latency, next_latency)
def sample_requests(tokenizer: PreTrainedTokenizerBase,
args: argparse.Namespace) -> List[SampleRequest]:
if args.dataset == 'json':
if args.json_schema_path is None:
dir_path = os.path.dirname(os.path.realpath(__file__))
args.json_schema_path = os.path.join(dir_path,
"structured_schemas",
"structured_schema_1.json")
with open(args.json_schema_path) as f:
schema = json.load(f)
prompt = f"Generate an example of a user profile given the following schema: {json.dumps(schema)}" # noqa: E501
input_len = len(tokenizer(prompt).input_ids)
print(f"Input length of the prompt: {input_len} tokens")
requests = [
SampleRequest(prompt=prompt,
prompt_len=input_len,
expected_output_len=args.output_len,
schema=schema,
structure_type=args.structure_type)
for _ in range(args.num_prompts)
]
elif args.dataset == "grammar":
schema = """
?start: select_statement
?select_statement: "SELECT " column_list " FROM " table_name
?column_list: column_name ("," column_name)*
?table_name: identifier
?column_name: identifier
?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/
"""
prompt = "Generate an SQL query to show the 'username' \
and 'email' from the 'users' table."
input_len = len(tokenizer(prompt).input_ids)
print(f"Input length of the prompt: {input_len} tokens")
requests = [
SampleRequest(prompt=prompt,
prompt_len=input_len,
expected_output_len=args.output_len,
schema=schema,
structure_type=args.structure_type)
for _ in range(args.num_prompts)
]
elif args.dataset == "regex":
regex = r"\w+@\w+\.com\n"
args.regex = regex
prompt = "Generate an email address for Alan Turing, \
who works in Enigma. End in .com and new line. \
Example result: alan.turing@enigma.com\n"
input_len = len(tokenizer(prompt).input_ids)
print(f"Input length of the prompt: {input_len} tokens")
requests = [
SampleRequest(prompt=prompt,
prompt_len=input_len,
expected_output_len=args.output_len,
schema=regex,
structure_type=args.structure_type)
for _ in range(args.num_prompts)
]
elif args.dataset == "choice":
choice = ["Positive", "Negative"]
args.choice = choice
prompt = "Classify this sentiment: vLLM is wonderful!"
input_len = len(tokenizer(prompt).input_ids)
print(f"Input length of the prompt: {input_len} tokens")
requests = [
SampleRequest(prompt=prompt,
prompt_len=input_len,
expected_output_len=args.output_len,
schema=choice,
structure_type=args.structure_type)
for _ in range(args.num_prompts)
]
elif args.dataset == "xgrammar_bench":
args.warmup = False
requests: List[SampleRequest] = []
dataset = datasets.load_dataset("NousResearch/json-mode-eval",
split="train")
print(f"dataset has {len(dataset)} entries")
len_dataset = len(dataset)
for data_point_idx in range(args.num_prompts):
idx = data_point_idx
while idx >= len_dataset:
idx -= len_dataset
schema = dataset["schema"][idx]
prompt = tokenizer.apply_chat_template(dataset["prompt"][idx],
tokenize=False)
input_len = len(tokenizer(prompt).input_ids)
completion = dataset["completion"][idx]
requests.append(
SampleRequest(prompt=prompt,
prompt_len=input_len,
expected_output_len=args.output_len,
schema=schema,
completion=completion))
return requests
def evaluate(ret, args):
def _eval_correctness_json(expected, actual):
# extract json string from string using regex
import re
actual = actual.replace('\n', '').replace(' ', '').strip()
try:
actual = re.search(r'\{.*\}', actual).group()
actual = json.loads(actual)
except Exception:
return False
return True
def _eval_correctness_choice(expected, actual):
return actual in args.choice
def _eval_correctness_regex(expected, actual):
import re
return re.match(args.regex, actual) is not None
def _eval_correctness(expected, actual):
if args.structure_type == 'json':
return _eval_correctness_json(expected, actual)
elif args.structure_type == 'regex':
return _eval_correctness_regex(expected, actual)
elif args.structure_type == 'choice':
return _eval_correctness_choice(expected, actual)
else:
return None
scores = []
for res in ret:
score = _eval_correctness(res['expected'], res['generated'])
res['correctness'] = score
scores.append(score)
not_none_scores = [score for score in scores if score is not None]
return (sum(not_none_scores) / len(not_none_scores) *
100) if len(not_none_scores) > 0 else None
def main(args: argparse.Namespace):
print(args)
random.seed(args.seed)
# The 'grammar', 'regex' and 'choice' datasets run on the synchronous engine only
if args.dataset == 'grammar':
args.structure_type = 'grammar'
args.async_engine = False
elif args.dataset == 'regex':
args.structure_type = 'regex'
args.async_engine = False
elif args.dataset == 'choice':
args.structure_type = 'choice'
args.async_engine = False
else:
args.structure_type = 'json'
if args.no_guided_decoding:
args.guided_decoding_ratio = 0
if args.save_results:
result_file_name = f'{args.guided_decoding_ratio}guided'
result_file_name += f"_{args.model.split('/')[-1]}"
result_file_name += f"_{args.dataset}"
result_file_name += f"_{args.num_prompts}"
result_file_name += f"_out{args.output_len}"
result_file_name += f"_async{args.async_engine}"
result_file_name += f"_warmup{args.warmup}"
result_file_name += f"_chunkedprefill{args.enable_chunked_prefill}"
result_file_name += ".txt"
else:
result_file_name = None
# Synthesize a prompt with the given input length.
tokenizer = AutoTokenizer.from_pretrained(
args.tokenizer, trust_remote_code=args.trust_remote_code)
requests = sample_requests(tokenizer, args)
if args.async_engine:
engine_args = AsyncEngineArgs.from_cli_args(args)
elapsed_time, ret, (first_latency, next_latency) = uvloop.run(
run_vllm_async(requests, engine_args, args.n,
args.guided_decoding_ratio, args.warmup,
args.disable_frontend_multiprocessing))
else:
engine_args = EngineArgs.from_cli_args(args)
elapsed_time, ret = run_vllm(requests, engine_args, args.n,
args.guided_decoding_ratio, args.warmup)
first_latency, next_latency = None, None
score = evaluate(ret, args)
total_num_tokens = sum(request.prompt_len + request.expected_output_len
for request in requests)
total_output_tokens = sum(request.expected_output_len
for request in requests)
if first_latency is not None:
latency_breakdown = "\nFirst token latency(msecs):\n"
latency_breakdown += f"{first_latency.describe()}"
latency_breakdown += "\nNext token latency(msecs):\n"
latency_breakdown += f"{next_latency.describe()}"
print(
f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
f"{total_output_tokens / elapsed_time:.2f} output tokens/s",
f"Correct rate is {score} %",
f"{latency_breakdown if first_latency is not None else ''}")
# Output JSON results if specified
if args.output_json or result_file_name:
results = {
"elapsed_time": elapsed_time,
"num_requests": len(requests),
"total_num_tokens": total_num_tokens,
"total_output_tokens": total_output_tokens,
"requests_per_second": len(requests) / elapsed_time,
"tokens_per_second": f"{total_num_tokens / elapsed_time:.2f}",
"output_tokens_per_second":
f"{total_output_tokens / elapsed_time:.2f}",
"correct_rate(%)": score
}
results = {"outputs": ret, **results}
if first_latency is not None:
results["first_token_latency(msecs)"] = first_latency.describe(
).to_dict()
results["next_token_latency(msecs)"] = next_latency.describe(
).to_dict()
if args.output_json:
with open(args.output_json, "w") as f:
json.dump(results, f, indent=4)
elif result_file_name:
with open(result_file_name, "w") as f:
json.dump(results, f, indent=4)
if __name__ == "__main__":
parser = FlexibleArgumentParser(description="Benchmark guided decoding.")
parser = AsyncEngineArgs.add_cli_args(parser)
parser.add_argument("--output-len",
type=int,
default=512,
help="Output length for each request. Overrides the "
"output length from the dataset.")
parser.add_argument(
"--dataset",
default='json',
choices=['json', 'grammar', 'regex', 'choice', 'xgrammar_bench'])
parser.add_argument("--json_schema_path",
type=str,
default=None,
help="Path to json schema.")
parser.add_argument("--n",
type=int,
default=1,
help="Number of generated sequences per prompt.")
parser.add_argument("--num-prompts",
type=int,
default=10,
help="Number of prompts to process.")
parser.add_argument(
'--output-json',
type=str,
default=None,
help='Path to save the throughput results in JSON format.')
parser.add_argument("--async-engine",
action='store_true',
default=False,
help="Use vLLM async engine rather than LLM class.")
parser.add_argument("--no-guided-decoding",
action='store_true',
default=False,
help="Whether to disable JSON decoding or not.")
parser.add_argument("--guided-decoding-ratio",
type=float,
default=1.0,
help="Ratio of Guided Decoding requests")
parser.add_argument("--disable-frontend-multiprocessing",
action='store_true',
default=False,
help="Disable decoupled async engine frontend.")
parser.add_argument("--warmup",
action="store_true",
default=False,
help="Run warmup prompts before benchmark.")
parser.add_argument("--save-results",
action="store_true",
default=False,
help="save output results.")
args = parser.parse_args()
if args.tokenizer is None:
args.tokenizer = args.model
main(args)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Benchmark the latency of processing a single batch of requests."""
import argparse
import dataclasses
import json
import os
import time
from typing import Any, Optional
import numpy as np
from tqdm import tqdm
import vllm.envs as envs
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.inputs import PromptType
from vllm.sampling_params import BeamSearchParams
from vllm.utils import FlexibleArgumentParser
def save_to_pytorch_benchmark_format(
args: argparse.Namespace, results: dict[str, Any]
) -> None:
pt_records = convert_to_pytorch_benchmark_format(
args=args,
metrics={"latency": results["latencies"]},
extra_info={k: results[k] for k in ["avg_latency", "percentiles"]},
)
if pt_records:
pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
write_to_json(pt_file, pt_records)
def main(args: argparse.Namespace):
print(args)
engine_args = EngineArgs.from_cli_args(args)
# NOTE(woosuk): If the request cannot be processed in a single batch,
# the engine will automatically process the request in multiple batches.
llm = LLM(**dataclasses.asdict(engine_args))
assert llm.llm_engine.model_config.max_model_len >= (
args.input_len + args.output_len
), (
"Please ensure that max_model_len is greater than"
" the sum of input_len and output_len."
)
sampling_params = SamplingParams(
n=args.n,
temperature=1.0,
top_p=1.0,
ignore_eos=True,
max_tokens=args.output_len,
detokenize=not args.disable_detokenize,
)
print(sampling_params)
dummy_prompt_token_ids = np.random.randint(
10000, size=(args.batch_size, args.input_len)
)
dummy_prompts: list[PromptType] = [
{"prompt_token_ids": batch} for batch in dummy_prompt_token_ids.tolist()
]
def llm_generate():
if not args.use_beam_search:
llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False)
else:
llm.beam_search(
dummy_prompts,
BeamSearchParams(
beam_width=args.n,
max_tokens=args.output_len,
ignore_eos=True,
),
)
def run_to_completion(profile_dir: Optional[str] = None):
if profile_dir:
llm.start_profile()
llm_generate()
llm.stop_profile()
else:
start_time = time.perf_counter()
llm_generate()
end_time = time.perf_counter()
latency = end_time - start_time
return latency
print("Warming up...")
for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
run_to_completion(profile_dir=None)
if args.profile:
profile_dir = envs.VLLM_TORCH_PROFILER_DIR
print(f"Profiling (results will be saved to '{profile_dir}')...")
run_to_completion(profile_dir=profile_dir)
return
# Benchmark.
latencies = []
for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
latencies.append(run_to_completion(profile_dir=None))
latencies = np.array(latencies)
percentages = [10, 25, 50, 75, 90, 99]
percentiles = np.percentile(latencies, percentages)
print(f"Avg latency: {np.mean(latencies)} seconds")
for percentage, percentile in zip(percentages, percentiles):
print(f"{percentage}% percentile latency: {percentile} seconds")
# Output JSON results if specified
if args.output_json:
results = {
"avg_latency": np.mean(latencies),
"latencies": latencies.tolist(),
"percentiles": dict(zip(percentages, percentiles.tolist())),
}
with open(args.output_json, "w") as f:
json.dump(results, f, indent=4)
save_to_pytorch_benchmark_format(args, results)
def create_argument_parser():
parser = FlexibleArgumentParser(
description="Benchmark the latency of processing a single batch of "
"requests till completion."
)
parser.add_argument("--input-len", type=int, default=32)
parser.add_argument("--output-len", type=int, default=128)
parser.add_argument("--batch-size", type=int, default=8)
parser.add_argument(
"--n",
type=int,
default=1,
help="Number of generated sequences per prompt.",
)
parser.add_argument("--use-beam-search", action="store_true")
parser.add_argument(
"--num-iters-warmup",
type=int,
default=10,
help="Number of iterations to run for warmup.",
)
parser.add_argument(
"--num-iters", type=int, default=30, help="Number of iterations to run."
)
parser.add_argument(
"--profile",
action="store_true",
help="profile the generation process of a single batch",
)
parser.add_argument(
"--output-json",
type=str,
default=None,
help="Path to save the latency results in JSON format.",
)
parser.add_argument(
"--disable-detokenize",
action="store_true",
help=(
"Do not detokenize responses (i.e. do not include "
"detokenization time in the latency measurement)"
),
)
parser = EngineArgs.add_cli_args(parser)
# V1 enables prefix caching by default which skews the latency
# numbers. We need to disable prefix caching by default.
parser.set_defaults(enable_prefix_caching=False)
return parser
if __name__ == "__main__":
parser = create_argument_parser()
args = parser.parse_args()
if args.profile and not envs.VLLM_TORCH_PROFILER_DIR:
raise OSError(
"The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. "
"Please set it to a valid path to use torch profiler."
)
main(args)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Offline benchmark to test the long document QA throughput.
Example usage:
# This workload samples 8 different prompts with a default input
# length of 20000 tokens, then replicates each prompt 2 times
# in random order.
python benchmark_long_document_qa_throughput.py \
--model meta-llama/Llama-2-7b-chat-hf \
--enable-prefix-caching \
--num-documents 8 \
--repeat-count 2
Commandline arguments:
--num-documents: The number of documents to sample prompts from.
--document-length: The length of each document in tokens.
(Optional, default: 20000)
--output-len: The number of tokens to generate for each prompt.
(Optional, default: 10)
--repeat-count: The number of times to repeat each prompt.
(Optional, default: 2)
--repeat-mode: The mode to repeat prompts. The supported modes are:
- 'random': shuffle the prompts randomly. (Default)
- 'tile': the entire prompt list is repeated in sequence. (Potentially
lowest cache hit)
- 'interleave': each prompt is repeated consecutively before
moving to the next element. (Highest cache hit)
--shuffle-seed: Random seed when the repeat mode is "random".
(Optional, default: 0)
In the meantime, it also supports all the vLLM engine args to initialize the
LLM engine. You can refer to the `vllm.engine.arg_utils.EngineArgs` for more
details.
"""
import dataclasses
import random
import time
from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.utils import FlexibleArgumentParser
def test_long_document_qa(llm=None, sampling_params=None, prompts=None):
"""
Test long document QA with the given prompts and sampling parameters.
Print the time spent in processing all the prompts.
Args:
llm: The language model used for generating responses.
sampling_params: Sampling parameter used to generate the response.
prompts: A list of prompt strings to be processed by the LLM.
"""
start_time = time.time()
llm.generate(prompts, sampling_params=sampling_params)
end_time = time.time()
print(f"Time to execute all requests: {end_time - start_time:.4f} secs")
def repeat_prompts(prompts, repeat_count, mode: str):
"""
Repeat each prompt in the list for a specified number of times.
The order of prompts in the output list depends on the mode.
Args:
prompts: A list of prompts to be repeated.
repeat_count: The number of times each prompt is repeated.
mode: The mode of repetition. Supported modes are:
- 'random': Shuffle the prompts randomly after repetition.
- 'tile': Repeat the entire prompt list in sequence.
Example: [1, 2, 3] -> [1, 2, 3, 1, 2, 3].
- 'interleave': Repeat each prompt consecutively before moving to
the next. Example: [1, 2, 3] -> [1, 1, 2, 2, 3, 3].
Returns:
A list of repeated prompts in the specified order.
Raises:
ValueError: If an invalid mode is provided.
"""
print("Repeat mode: ", mode)
if mode == "random":
repeated_prompts = prompts * repeat_count
random.shuffle(repeated_prompts)
return repeated_prompts
elif mode == "tile":
return prompts * repeat_count
elif mode == "interleave":
repeated_prompts = []
for prompt in prompts:
repeated_prompts.extend([prompt] * repeat_count)
return repeated_prompts
else:
raise ValueError(
f"Invalid mode: {mode}, only support 'random', 'tile', 'interleave'"
)
def main(args):
random.seed(args.shuffle_seed)
# Prepare the prompts:
# we append the document id at the beginning to avoid any of the document
# being the prefix of other documents
prompts = [
str(i) + " ".join(["hi"] * args.document_length)
for i in range(args.num_documents)
]
prompts = repeat_prompts(prompts, args.repeat_count, mode=args.repeat_mode)
warmup_prompts = [
"This is warm up request " + str(i) + " ".join(["hi"] * args.document_length)
for i in range(args.num_documents)
]
# Create the LLM engine
engine_args = EngineArgs.from_cli_args(args)
llm = LLM(**dataclasses.asdict(engine_args))
sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
print("------warm up------")
test_long_document_qa(
llm=llm,
prompts=warmup_prompts,
sampling_params=sampling_params,
)
print("------start generating------")
test_long_document_qa(
llm=llm,
prompts=prompts,
sampling_params=sampling_params,
)
def create_argument_parser():
parser = FlexibleArgumentParser(
description="Benchmark the performance with or "
"without automatic prefix caching."
)
parser.add_argument(
"--document-length",
type=int,
# Roughly the number of tokens for a system paper,
# excluding images
default=20000,
help="Range of input lengths for sampling prompts, "
'specified as "min:max" (e.g., "128:256").',
)
parser.add_argument(
"--num-documents",
type=int,
default=8,
help="Range of input lengths for sampling prompts, "
'specified as "min:max" (e.g., "128:256").',
)
parser.add_argument("--output-len", type=int, default=10)
parser.add_argument(
"--repeat-count",
type=int,
default=2,
help="Number of times to repeat each prompt",
)
parser.add_argument(
"--repeat-mode",
type=str,
default="random",
help="The mode to repeat prompts. The supported "
'modes are "random", "tile", and "interleave". '
"See repeat_prompts() in the source code for details.",
)
parser.add_argument(
"--shuffle-seed",
type=int,
default=0,
help='Random seed when the repeat mode is "random"',
)
parser = EngineArgs.add_cli_args(parser)
return parser
if __name__ == "__main__":
parser = create_argument_parser()
args = parser.parse_args()
main(args)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Benchmark the efficiency of prefix caching.
This script allows you to benchmark the performance of
a model with and without prefix caching using either fixed prompts
or prompts sampled from the ShareGPT dataset.
Fixed example usage:
python benchmark_prefix_caching.py \
--model meta-llama/Llama-2-7b-chat-hf \
--enable-prefix-caching \
--num-prompts 1 \
--repeat-count 100 \
--input-length-range 128:256
ShareGPT example usage:
# This command samples 20 prompts with input lengths
# between 128 and 256 tokens from the ShareGPT dataset,
# then replicates each prompt 5 times.
python benchmark_prefix_caching.py \
--model meta-llama/Llama-2-7b-chat-hf \
--dataset-path /path/to/ShareGPT_V3_unfiltered_cleaned_split.json \
--enable-prefix-caching \
--num-prompts 20 \
--repeat-count 5 \
--input-length-range 128:256
"""
import dataclasses
import json
import random
import time
from typing import Optional
from transformers import PreTrainedTokenizerBase
from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.utils import FlexibleArgumentParser
# import triton
try:
from vllm.transformers_utils.tokenizer import get_tokenizer
except ImportError:
from backend_request_func import get_tokenizer
# triton_version = triton.__version__
# if triton_version.startswith("2.1"):
# from triton.common.backend import compute_core_version_key
# elif triton_version.startswith("3.0"):
# from triton.compiler.compiler import triton_key
# else:
# print(f"TRITON version {triton_version} is not specifically handled.")
PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as fellows. You need to answer my question about the table.\n# Table\n|Opening|Opening|Sl. No.|Film|Cast|Director|Music Director|Notes|\n|----|----|----|----|----|----|----|----|\n|J A N|9|1|Agni Pushpam|Jayabharathi, Kamalahasan|Jeassy|M. K. Arjunan||\n|J A N|16|2|Priyamvada|Mohan Sharma, Lakshmi, KPAC Lalitha|K. S. Sethumadhavan|V. Dakshinamoorthy||\n|J A N|23|3|Yakshagaanam|Madhu, Sheela|Sheela|M. S. Viswanathan||\n|J A N|30|4|Paalkkadal|Sheela, Sharada|T. K. Prasad|A. T. Ummer||\n|F E B|5|5|Amma|Madhu, Srividya|M. Krishnan Nair|M. K. Arjunan||\n|F E B|13|6|Appooppan|Thikkurissi Sukumaran Nair, Kamal Haasan|P. Bhaskaran|M. S. Baburaj||\n|F E B|20|7|Srishti|Chowalloor Krishnankutty, Ravi Alummoodu|K. T. Muhammad|M. S. Baburaj||\n|F E B|20|8|Vanadevatha|Prem Nazir, Madhubala|Yusufali Kechery|G. Devarajan||\n|F E B|27|9|Samasya|Madhu, Kamalahaasan|K. Thankappan|Shyam||\n|F E B|27|10|Yudhabhoomi|K. P. Ummer, Vidhubala|Crossbelt Mani|R. K. Shekhar||\n|M A R|5|11|Seemantha Puthran|Prem Nazir, Jayabharathi|A. B. Raj|M. K. Arjunan||\n|M A R|12|12|Swapnadanam|Rani Chandra, Dr. Mohandas|K. G. George|Bhaskar Chandavarkar||\n|M A R|19|13|Thulavarsham|Prem Nazir, sreedevi, Sudheer|N. Sankaran Nair|V. Dakshinamoorthy||\n|M A R|20|14|Aruthu|Kaviyoor Ponnamma, Kamalahasan|Ravi|G. Devarajan||\n|M A R|26|15|Swimming Pool|Kamal Haasan, M. G. Soman|J. Sasikumar|M. K. Arjunan||\n\n# Question\nWhat' s the content in the (1,1) cells\n" # noqa: E501
def test_prefix(llm=None, sampling_params=None, prompts=None):
# if triton_version.startswith("2.1"):
# version_key = compute_core_version_key()
# if triton_version.startswith("3.0"):
# version_key = triton_key()
start_time = time.time()
llm.generate(prompts, sampling_params=sampling_params)
end_time = time.time()
print(f"cost time {end_time - start_time}")
@dataclasses.dataclass
class Request:
prompt: str
prompt_len: int
output_len: int
def sample_tokens(tokenizer: PreTrainedTokenizerBase, length: int) -> list[int]:
vocab = tokenizer.get_vocab()
all_special_ids = set(tokenizer.all_special_ids)
# Remove the special tokens.
return random.choices(
[v for k, v in vocab.items() if k not in all_special_ids],
k=length,
)
def sample_requests_from_dataset(
dataset_path: str,
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
input_length_range: tuple[int, int],
fixed_output_len: Optional[int],
) -> list[Request]:
if fixed_output_len is not None and fixed_output_len < 4:
raise ValueError("output_len too small")
# Load the dataset.
with open(dataset_path) as f:
dataset = json.load(f)
# Filter out the conversations with less than 2 turns.
dataset = [data for data in dataset if len(data["conversations"]) >= 2]
# Only keep the first two turns of each conversation.
dataset = [
(data["conversations"][0]["value"], data["conversations"][1]["value"])
for data in dataset
]
# Shuffle the dataset.
random.shuffle(dataset)
min_len, max_len = input_length_range
assert min_len >= 0 and max_len >= min_len, "input_length_range too small"
# Filter out sequences that are too long or too short
filtered_requests: list[Request] = []
for i in range(len(dataset)):
if len(filtered_requests) == num_requests:
break
# Tokenize the prompts and completions.
prompt_token_ids = tokenizer(dataset[i][0]).input_ids
prompt = tokenizer.decode(prompt_token_ids)
completion = dataset[i][1]
completion_token_ids = tokenizer(completion).input_ids
prompt_len = len(prompt_token_ids)
output_len = (
len(completion_token_ids) if fixed_output_len is None else fixed_output_len
)
if min_len <= prompt_len <= max_len:
filtered_requests.append(Request(prompt, prompt_len, output_len))
return filtered_requests
def sample_requests_from_random(
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
input_length_range: tuple[int, int],
fixed_output_len: Optional[int],
prefix_len: int,
) -> list[Request]:
requests = []
prefix_token_ids = sample_tokens(tokenizer, prefix_len)
min_len, max_len = input_length_range
for i in range(num_requests):
unique_part_token_ids = sample_tokens(
tokenizer, random.randint(min_len - prefix_len, max_len - prefix_len)
)
prompt_token_ids = prefix_token_ids + unique_part_token_ids
prompt = tokenizer.decode(prompt_token_ids)
prompt_len = len(prompt_token_ids)
assert min_len <= prompt_len <= max_len, (
f"prompt_len {prompt_len} out of range {min_len}:{max_len}"
)
requests.append(Request(prompt, prompt_len, fixed_output_len))
return requests
def repeat_and_sort_requests(
requests: list[Request], repeat_count: int, sort: bool = False
) -> list[str]:
repeated_requests = requests * repeat_count
if sort:
repeated_requests.sort(key=lambda x: x[1])
else:
random.shuffle(repeated_requests)
return [req.prompt for req in repeated_requests]
def main(args):
tokenizer = get_tokenizer(args.model, trust_remote_code=True)
input_length_range = tuple(map(int, args.input_length_range.split(":")))
random.seed(args.seed)
if args.dataset_path is not None:
if args.prefix_len > 0:
raise ValueError(
"prefix-len is not supported when dataset-path is provided."
)
print(f"Start to sample {args.num_prompts} prompts from {args.dataset_path}")
filtered_requests = sample_requests_from_dataset(
dataset_path=args.dataset_path,
num_requests=args.num_prompts,
tokenizer=tokenizer,
input_length_range=input_length_range,
fixed_output_len=args.output_len,
)
else:
print(f"Start to sample {args.num_prompts} prompts from random")
filtered_requests = sample_requests_from_random(
num_requests=args.num_prompts,
tokenizer=tokenizer,
input_length_range=input_length_range,
fixed_output_len=args.output_len,
prefix_len=args.prefix_len,
)
# Print some helpful stats of the requests.
print(f"Sampled {len(filtered_requests)} requests.")
prompt_lens = [req.prompt_len for req in filtered_requests]
print(f"Average input length: {sum(prompt_lens) / len(prompt_lens)}")
print(f"P50 input length: {sorted(prompt_lens)[len(prompt_lens) // 2]}")
print(f"Min Prompt Length: {min(prompt_lens)}")
print(f"Max Prompt Length: {max(prompt_lens)}")
engine_args = EngineArgs.from_cli_args(args)
llm = LLM(**dataclasses.asdict(engine_args))
sampling_params = SamplingParams(
temperature=0,
max_tokens=args.output_len,
detokenize=not args.disable_detokenize,
)
print("Testing filtered requests")
prompts = repeat_and_sort_requests(
filtered_requests, repeat_count=args.repeat_count, sort=args.sort
)
print("------start generating------")
test_prefix(
llm=llm,
prompts=prompts,
sampling_params=sampling_params,
)
def create_argument_parser():
parser = FlexibleArgumentParser(
description="Benchmark the performance with or without "
"automatic prefix caching."
)
parser.add_argument(
"--dataset-path", type=str, default=None, help="Path to the dataset."
)
parser.add_argument("--output-len", type=int, default=10)
parser.add_argument(
"--num-prompts",
type=int,
required=True,
help="Number of the prompts sampled from dataset",
)
parser.add_argument(
"--repeat-count",
type=int,
default=1,
help="Number of times to repeat each prompt",
)
parser.add_argument(
"--sort", action="store_true", help="Sort prompts by input length"
)
parser.add_argument(
"--input-length-range",
type=str,
required=True,
help="Range of input lengths for sampling prompts,"
'specified as "min:max" (e.g., "128:256").',
)
parser.add_argument(
"--prefix-len",
type=int,
default=0,
help="Specifies the length of a common prefix to be "
"added to the input prompt. The input-length-range will "
"subtract this length when filtering prompts. Only used "
"when dataset-path is not provided.",
)
parser.add_argument(
"--disable-detokenize",
action="store_true",
help=(
"Do not detokenize responses (i.e. do not include "
"detokenization time in the latency measurement)"
),
)
parser = EngineArgs.add_cli_args(parser)
return parser
if __name__ == "__main__":
parser = create_argument_parser()
args = parser.parse_args()
main(args)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Benchmark offline prioritization."""
import argparse
import dataclasses
import json
import random
import time
from typing import Optional
from transformers import AutoTokenizer, PreTrainedTokenizerBase
from vllm.engine.arg_utils import EngineArgs
from vllm.utils import FlexibleArgumentParser
# Select an equiprobable random priority (0 or 1)
def get_random_flag():
return 0 if random.random() < 0.5 else 1
def sample_requests(
dataset_path: str,
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
fixed_output_len: Optional[int],
) -> list[tuple[str, int, int, int]]:
if fixed_output_len is not None and fixed_output_len < 4:
raise ValueError("output_len too small")
# Load the dataset.
with open(dataset_path) as f:
dataset = json.load(f)
# Filter out the conversations with less than 2 turns.
dataset = [data for data in dataset if len(data["conversations"]) >= 2]
# Only keep the first two turns of each conversation.
dataset = [
(data["conversations"][0]["value"], data["conversations"][1]["value"])
for data in dataset
]
# Shuffle the dataset.
random.shuffle(dataset)
# Filter out sequences that are too long or too short
filtered_dataset: list[tuple[str, int, int]] = []
for i in range(len(dataset)):
if len(filtered_dataset) == num_requests:
break
# Tokenize the prompts and completions.
prompt = dataset[i][0]
prompt_token_ids = tokenizer(prompt).input_ids
completion = dataset[i][1]
completion_token_ids = tokenizer(completion).input_ids
prompt_len = len(prompt_token_ids)
output_len = (
len(completion_token_ids) if fixed_output_len is None else fixed_output_len
)
if prompt_len < 4 or output_len < 4:
# Prune too short sequences.
continue
if prompt_len > 1024 or prompt_len + output_len > 2048:
# Prune too long sequences.
continue
priority = get_random_flag()
filtered_dataset.append((prompt, prompt_len, output_len, priority))
return filtered_dataset
def run_vllm(
requests: list[tuple[str, int, int, int]],
n: int,
engine_args: EngineArgs,
disable_detokenize: bool = False,
) -> float:
from vllm import LLM, SamplingParams
llm = LLM(**dataclasses.asdict(engine_args))
assert all(
llm.llm_engine.model_config.max_model_len >= (request[1] + request[2])
for request in requests
), (
"Please ensure that max_model_len is greater than the sum of"
" input_len and output_len for all requests."
)
# Add the requests to the engine.
prompts = []
sampling_params = []
priority = []
for prompt, _, output_len, _priority in requests:
prompts.append(prompt)
priority.append(_priority)
sampling_params.append(
SamplingParams(
n=n,
temperature=1.0,
top_p=1.0,
ignore_eos=True,
max_tokens=output_len,
detokenize=not disable_detokenize,
)
)
start = time.perf_counter()
llm.generate(prompts, sampling_params, priority=priority, use_tqdm=True)
end = time.perf_counter()
return end - start
def main(args: argparse.Namespace):
print(args)
random.seed(args.seed)
# Sample the requests.
tokenizer = AutoTokenizer.from_pretrained(
args.tokenizer, trust_remote_code=args.trust_remote_code
)
if args.dataset is None:
# Synthesize a prompt with the given input length.
prompt = "hi" * (args.input_len - 1)
requests = [
(prompt, args.input_len, args.output_len, get_random_flag())
for _ in range(args.num_prompts)
]
else:
requests = sample_requests(
args.dataset, args.num_prompts, tokenizer, args.output_len
)
if args.backend == "vllm":
elapsed_time = run_vllm(
requests, args.n, EngineArgs.from_cli_args(args), args.disable_detokenize
)
else:
raise ValueError(f"Unknown backend: {args.backend}")
total_num_tokens = sum(
prompt_len + output_len for _, prompt_len, output_len, priority in requests
)
print(
f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
f"{total_num_tokens / elapsed_time:.2f} tokens/s"
)
# Output JSON results if specified
if args.output_json:
results = {
"elapsed_time": elapsed_time,
"num_requests": len(requests),
"total_num_tokens": total_num_tokens,
"requests_per_second": len(requests) / elapsed_time,
"tokens_per_second": total_num_tokens / elapsed_time,
}
with open(args.output_json, "w") as f:
json.dump(results, f, indent=4)
def create_argument_parser():
parser = FlexibleArgumentParser(description="Benchmark the throughput.")
parser.add_argument(
"--backend", type=str, choices=["vllm", "hf", "mii"], default="vllm"
)
parser.add_argument(
"--dataset", type=str, default=None, help="Path to the dataset."
)
parser.add_argument(
"--input-len",
type=int,
default=None,
help="Input prompt length for each request",
)
parser.add_argument(
"--output-len",
type=int,
default=None,
help="Output length for each request. Overrides the "
"output length from the dataset.",
)
parser.add_argument(
"--n", type=int, default=1, help="Number of generated sequences per prompt."
)
parser.add_argument(
"--num-prompts", type=int, default=200, help="Number of prompts to process."
)
parser.add_argument(
"--output-json",
type=str,
default=None,
help="Path to save the throughput results in JSON format.",
)
parser.add_argument(
"--disable-detokenize",
action="store_true",
help=(
"Do not detokenize responses (i.e. do not include "
"detokenization time in the latency measurement)"
),
)
parser = EngineArgs.add_cli_args(parser)
return parser
if __name__ == "__main__":
parser = create_argument_parser()
args = parser.parse_args()
if args.tokenizer is None:
args.tokenizer = args.model
if args.dataset is None:
assert args.input_len is not None
assert args.output_len is not None
else:
assert args.input_len is None
main(args)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import json
import math
import os
from typing import Any
def convert_to_pytorch_benchmark_format(
args: argparse.Namespace, metrics: dict[str, list], extra_info: dict[str, Any]
) -> list:
"""
Save the benchmark results in the format used by the PyTorch OSS benchmark,
with one metric per record:
https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
"""
records = []
if not os.environ.get("SAVE_TO_PYTORCH_BENCHMARK_FORMAT", False):
return records
for name, benchmark_values in metrics.items():
record = {
"benchmark": {
"name": "vLLM benchmark",
"extra_info": {
"args": vars(args),
},
},
"model": {
"name": args.model,
},
"metric": {
"name": name,
"benchmark_values": benchmark_values,
"extra_info": extra_info,
},
}
tp = record["benchmark"]["extra_info"]["args"].get("tensor_parallel_size")
# Save tensor_parallel_size parameter if it's part of the metadata
if not tp and "tensor_parallel_size" in extra_info:
record["benchmark"]["extra_info"]["args"]["tensor_parallel_size"] = (
extra_info["tensor_parallel_size"]
)
records.append(record)
return records
class InfEncoder(json.JSONEncoder):
def clear_inf(self, o: Any):
if isinstance(o, dict):
return {k: self.clear_inf(v) for k, v in o.items()}
elif isinstance(o, list):
return [self.clear_inf(v) for v in o]
elif isinstance(o, float) and math.isinf(o):
return "inf"
return o
def iterencode(self, o: Any, *args, **kwargs) -> Any:
return super().iterencode(self.clear_inf(o), *args, **kwargs)
def write_to_json(filename: str, records: list) -> None:
with open(filename, "w") as f:
json.dump(
records,
f,
cls=InfEncoder,
default=lambda o: f"<{type(o).__name__} object is not JSON serializable>",
)
echo "tp,data_type,batch,prompt_tokens,completion_tokens,TOTAL_THROUGHPUT(toks/s),generate_throughput(toks/s), TTFT(ms),TPOT(ms),ITL(ms),P99_TTFT(ms),P99_TPOT(ms),P99_ITL(ms)" > DeepSeek-R1-0528-W4A8-V2_tp8-nopc.csv
pairs=( "512 512" "1024 512" "2048 512" "4096 512" "8192 512" "16384 512")
model_path="/module3/DeepSeek-R1-0528-W4A8-V2"
tp=8
data_type="W4A8"
mkdir -p ./log/
for pair in "${pairs[@]}"; do
for batch in 1 2 4 8 16 32 64 128 ; do
prompt_tokens=${pair%% *}
completion_tokens=${pair#* }
echo "data_type: $data_type,batch: $batch, prompt_tokens: $prompt_tokens, completion_tokens: $completion_tokens, tp: ${tp}"
log_path="log/vllm_${model}_batch_${batch}_prompt_tokens_${prompt_tokens}_completion_tokens_${completion_tokens}_tp_${tp}.log"
touch $log_path
# benchmark_throughput.py
python benchmark_serving.py \
--backend openai \
--port 20011 \
--model ${model_path} \
--trust-remote-code \
--dataset-name random \
--ignore-eos \
--random-input-len ${prompt_tokens} \
--random-output-len ${completion_tokens} \
--num-prompts ${batch} \
2>&1 | tee $log_path
#metric
E2E_TIME=`grep "^Benchmark duration" $log_path | awk -F ' ' '{print $4}'`
REQ_THROUGHPUT=`grep "^Request throughput" $log_path| awk -F ' ' '{print $4}'`
GEN_THROUGHPUT=`grep "^Output token" $log_path| awk -F ' ' '{print $5}'`
TOTAL_THROUGHPUT=`grep "^Total Token" $log_path| awk -F ' ' '{print $5}'`
TTFT=`grep "^Mean TTFT" $log_path| awk -F ' ' '{print $4}'`
TPOT=`grep "^Mean TPOT" $log_path| awk -F ' ' '{print $4}'`
ITL=`grep "^Mean ITL" $log_path| awk -F ' ' '{print $4}'`
P99_ITL=`grep "^P99 ITL" $log_path| awk -F ' ' '{print $4}'`
P99_TTFT=`grep "^P99 TTFT" $log_path| awk -F ' ' '{print $4}'`
P99_TPOT=`grep "^P99 TPOT" $log_path| awk -F ' ' '{print $4}'`
echo "$tp,$data_type,$batch,$prompt_tokens,$completion_tokens,$TOTAL_THROUGHPUT,$GEN_THROUGHPUT,$TTFT,$TPOT, $ITL,$P99_TTFT,$P99_TPOT,$P99_ITL" >> DeepSeek-R1-0528-W4A8-V2_tp8-nopc.csv
done
done
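The sweep above appends one row per (prompt length, batch size) combination to DeepSeek-R1-0528-W4A8-V2_tp8-nopc.csv. A small optional post-processing sketch is shown below; it assumes pandas is available and that the column names match the header written by the script.

# Hypothetical summary of the sweep CSV: total throughput by prompt length and batch size.
import pandas as pd

df = pd.read_csv("DeepSeek-R1-0528-W4A8-V2_tp8-nopc.csv")
summary = df.pivot_table(
    index="prompt_tokens",
    columns="batch",
    values="TOTAL_THROUGHPUT(toks/s)",
)
print(summary.round(1))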
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import socket
import threading
import uuid
import aiohttp
import msgpack
import zmq
from quart import Quart, make_response, request
count = 0
prefill_instances: dict[str, str] = {} # http_address: zmq_address
decode_instances: dict[str, str] = {} # http_address: zmq_address
prefill_cv = threading.Condition()
decode_cv = threading.Condition()
def _listen_for_register(poller, router_socket):
while True:
socks = dict(poller.poll())
if router_socket in socks:
remote_address, message = router_socket.recv_multipart()
# data: {"type": "P", "http_address": "ip:port",
# "zmq_address": "ip:port"}
data = msgpack.loads(message)
if data["type"] == "P":
global prefill_instances
global prefill_cv
with prefill_cv:
prefill_instances[data["http_address"]] = data["zmq_address"]
elif data["type"] == "D":
global decode_instances
global decode_cv
with decode_cv:
decode_instances[data["http_address"]] = data["zmq_address"]
else:
print(
f"Unexpected message received from {remote_address}, data: {data}"
)
def start_service_discovery(hostname, port):
if not hostname:
hostname = socket.gethostname()
if port == 0:
raise ValueError("Port cannot be 0")
context = zmq.Context()
router_socket = context.socket(zmq.ROUTER)
router_socket.bind(f"tcp://{hostname}:{port}")
poller = zmq.Poller()
poller.register(router_socket, zmq.POLLIN)
_listener_thread = threading.Thread(
target=_listen_for_register, args=[poller, router_socket], daemon=True
)
_listener_thread.start()
return _listener_thread
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
app = Quart(__name__)
def random_uuid() -> str:
return str(uuid.uuid4().hex)
async def forward_request(url, data, request_id):
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
headers = {
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
"X-Request-Id": request_id,
}
async with session.post(url=url, json=data, headers=headers) as response:
if response.status == 200:
if True:
async for chunk_bytes in response.content.iter_chunked(1024):
yield chunk_bytes
else:
content = await response.read()
yield content
@app.route("/v1/completions", methods=["POST"])
async def handle_request():
try:
original_request_data = await request.get_json()
prefill_request = original_request_data.copy()
# change max_tokens = 1 to let it only do prefill
prefill_request["max_tokens"] = 1
global count
global prefill_instances
global prefill_cv
with prefill_cv:
prefill_list = list(prefill_instances.items())
prefill_addr, prefill_zmq_addr = prefill_list[count % len(prefill_list)]
global decode_instances
global decode_cv
with decode_cv:
decode_list = list(decode_instances.items())
decode_addr, decode_zmq_addr = decode_list[count % len(decode_list)]
print(
f"handle_request count: {count}, [HTTP:{prefill_addr}, "
f"ZMQ:{prefill_zmq_addr}] 👉 [HTTP:{decode_addr}, "
f"ZMQ:{decode_zmq_addr}]"
)
count += 1
request_id = (
f"___prefill_addr_{prefill_zmq_addr}___decode_addr_"
f"{decode_zmq_addr}_{random_uuid()}"
)
# finish prefill
async for _ in forward_request(
f"http://{prefill_addr}/v1/completions", prefill_request, request_id
):
continue
# return decode
generator = forward_request(
f"http://{decode_addr}/v1/completions", original_request_data, request_id
)
response = await make_response(generator)
response.timeout = None
return response
except Exception as e:
import sys
import traceback
exc_info = sys.exc_info()
print("Error occurred in disagg prefill proxy server")
print(e)
print("".join(traceback.format_exception(*exc_info)))
if __name__ == "__main__":
t = start_service_discovery("0.0.0.0", 30007)
app.run(host="0.0.0.0", port=10007)
t.join()
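Once both the prefill and decode instances have registered with the service-discovery port (30007), clients talk to the proxy's HTTP port (10007 above) just as they would to a single vLLM OpenAI-compatible endpoint. A minimal client sketch follows, assuming the proxy runs on localhost; the prompt and token budget are placeholders.

# Hypothetical end-to-end client for the disaggregated-prefill proxy (HTTP port 10007).
import json
import urllib.request

payload = {
    "model": "/module3/DeepSeek-R1-0528-W4A8-V2",
    "prompt": "San Francisco is a",
    "max_tokens": 32,
}
req = urllib.request.Request(
    "http://localhost:10007/v1/completions",
    data=json.dumps(payload).encode(),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req, timeout=600) as resp:
    print(resp.read().decode())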