sglangv0.5.2 & support Qwen3-Next-80B-A3B-Instruct

118f1fc7 · maxiao1 · 118f1fc7 · 118f1fc7 · 118f1fc7 · 118f1fc7
Commit 118f1fc7 authored Sep 13, 2025 by maxiao1
20 changed files
--- a/benchmark/hicache/bench_mix.py
+++ b/benchmark/hicache/bench_mix.py
+import argparse
+import asyncio
+import json
+import logging
+import os
+import queue
+import random
+import threading
+import time
+from dataclasses import dataclass
+from functools import wraps
+
+import aiohttp
+
+from sglang.bench_serving import (
+    RequestFuncOutput,
+    get_tokenizer,
+    remove_prefix,
+    sample_random_requests,
+)
+
+# Module-level logger; its level is configured via logging.basicConfig() in main().
+logger = logging.getLogger(__name__)
+
+# Open file handle for the JSONL debug log. It stays None (which makes
+# write_debug_log() a no-op) unless main() opens it for --log-level debug.
+debug_log_file = None
+# Lock held around each write + flush to debug_log_file so that records
+# written from different threads are not interleaved.
+debug_log_lock = threading.Lock()
+
+
+def write_debug_log(data):
+    """Append ``data`` as one JSON line to the debug log, if one is open.
+
+    Args:
+        data: A JSON-serializable object; it is written with ``json.dumps``
+            followed by a newline (JSONL format).
+
+    Does nothing when the module-level ``debug_log_file`` is ``None``.
+    """
+    # Only reads the module globals, so no ``global`` declaration is needed.
+    if debug_log_file is None:
+        return
+
+    # Hold the lock across write + flush so each record lands as one
+    # complete line even with concurrent writers.
+    with debug_log_lock:
+        debug_log_file.write(json.dumps(data) + "\n")
+        debug_log_file.flush()
+
+
+def parse_args():
+    """Parse the command-line options of the mixed-workload benchmark.
+
+    Returns:
+        argparse.Namespace with ``model_path``, ``dataset_path``, ``host``,
+        ``port``, ``duration``, ``log_level`` and ``debug_log_file``.
+    """
+    parser = argparse.ArgumentParser(
+        description="Script to benchmark concurrent requests to a server."
+    )
+    parser.add_argument(
+        "--model-path",
+        type=str,
+        default="/data/models/Qwen3-0.6B",
+        help="model path compatible with Hugging Face Transformers",
+    )
+    parser.add_argument(
+        "--dataset-path",
+        type=str,
+        default="/data/models/ShareGPT_V3_unfiltered_cleaned_split/ShareGPT_V3_unfiltered_cleaned_split.json",
+        help="local dataset to sample tokens from",
+    )
+    parser.add_argument(
+        "--host",
+        type=str,
+        default="localhost",
+        help="Server hostname or IP (default: localhost)",
+    )
+    parser.add_argument(
+        "--port",
+        type=int,
+        default=30000,
+        help="Server port (default: 30000)",
+    )
+    parser.add_argument(
+        "--duration",
+        type=int,
+        default=600,
+        # The help text previously advertised 300 while the default is 600.
+        help="Duration to run the benchmark in seconds (default: 600 seconds)",
+    )
+    parser.add_argument(
+        "--log-level",
+        type=str,
+        default="info",
+        choices=["debug", "info"],
+        help="Set the logging level (default: info)",
+    )
+    parser.add_argument(
+        "--debug-log-file",
+        type=str,
+        default="debug.log.jsonl",
+        help="File to write debug logs in JSONL format",
+    )
+    return parser.parse_args()
+
+
+def load_config():
+    """Load and validate the workload configuration.
+
+    The JSON file path is taken from the ``CONFIG_PATH`` environment
+    variable. The parsed config is printed and returned unchanged.
+
+    Returns:
+        dict: the parsed configuration.
+
+    Raises:
+        ValueError: if ``CONFIG_PATH`` is unset/empty, or if any per-round
+            list does not contain exactly ``num_rounds`` entries.
+        KeyError: if a required key is missing.
+    """
+    config_path = os.getenv("CONFIG_PATH")
+    if not config_path:
+        raise ValueError("Environment variable 'CONFIG_PATH' is not set.")
+
+    with open(config_path, "r") as f:
+        config = json.load(f)
+
+    # Keys whose value must be a list with one entry per round.
+    per_round_keys = [
+        "round_ratios",
+        "mean_new_tokens_per_round",
+        "mean_return_tokens_per_round",
+        "mean_inter_round_interval",
+    ]
+    required_keys = ["num_rounds", "num_clients"] + per_round_keys
+
+    for key in required_keys:
+        if key not in config:
+            raise KeyError(f"Missing required configuration key: {key}")
+
+    # Explicit checks instead of ``assert`` so validation still runs under
+    # ``python -O`` and the error names the offending key.
+    num_rounds = config["num_rounds"]
+    for key in per_round_keys:
+        if len(config[key]) != num_rounds:
+            raise ValueError(
+                f"Configuration key '{key}' must have {num_rounds} entries "
+                f"(num_rounds), got {len(config[key])}"
+            )
+
+    print(config)
+
+    return config
+
+
+@dataclass
+class UserData:
+    """Mutable state of one simulated user across its conversation rounds."""
+
+    # Sequential id assigned by UserGenerator.gen().
+    user_id: int
+    # Zero-based index of the round this user will send next.
+    current_round: int
+    # Number of rounds this user runs before it is dropped.
+    total_rounds: int
+    # Full prompt text sent to the server; grows each round.
+    prompt: str
+    # Number of new tokens requested for the next request.
+    return_tokens: int
+    # Earliest time.perf_counter() value at which the next round may be sent
+    # (0 for the first round); a float, not an int.
+    start: float
+
+
+def synchronized():
+    """Build a method decorator that serializes calls through ``self.lock``.
+
+    The decorated method runs with the instance's ``lock`` attribute held as
+    a context manager; its return value is passed through unchanged.
+    """
+
+    def _wrap_method(method):
+        def _locked_call(self, *args, **kwargs):
+            with self.lock:
+                result = method(self, *args, **kwargs)
+            return result
+
+        # Preserve the wrapped method's name/docstring on the wrapper.
+        return wraps(method)(_locked_call)
+
+    return _wrap_method
+
+
+class UserGenerator:
+    """Creates simulated users and schedules their follow-up rounds.
+
+    ``pop()`` hands out the next request to send: a queued multi-turn user
+    whose start time has passed, otherwise a brand-new user. ``push()`` takes
+    a finished round, extends the user's prompt and re-queues the user for
+    its next round. Both are serialized through ``self.lock``.
+    """
+
+    def __init__(self, config, model_path, dataset_path):
+        """Load the tokenizer and pre-sample candidate prompts per round.
+
+        Args:
+            config: dict as returned by ``load_config()``.
+            model_path: tokenizer path passed to ``get_tokenizer``.
+            dataset_path: dataset passed to ``sample_random_requests``.
+        """
+        self.tokenizer_path = model_path
+        self.tokenizer = get_tokenizer(self.tokenizer_path)
+        self.dataset_path = dataset_path
+
+        self.user_id = 0
+        self.lock = threading.Lock()
+
+        self.num_rounds = config["num_rounds"]
+
+        # cumulative_ratios[k] == sum(round_ratios[: k + 1]); used by gen()
+        # to pick how many rounds a new user will run.
+        self.cumulative_ratios = [
+            sum(config["round_ratios"][: i + 1])
+            for i in range(len(config["round_ratios"]))
+        ]
+        self.mean_new_tokens_per_round = config["mean_new_tokens_per_round"]
+        self.mean_return_tokens_per_round = config["mean_return_tokens_per_round"]
+        self.mean_inter_round_interval = config["mean_inter_round_interval"]
+
+        # Standard deviation for every random.gauss() draw below (both
+        # return-token counts and inter-round intervals).
+        self.sigma = 100
+        self.range_ratio = 0.8
+        assert self.range_ratio <= 1
+
+        # One list of candidate prompts per round.
+        # NOTE(review): presumably sample_random_requests draws lengths in
+        # [len * range_ratio, len], which with these arguments would centre
+        # the lengths on the configured means — confirm against its docs.
+        self.candidate_inputs = [
+            list(
+                sample_random_requests(
+                    input_len=(
+                        self.mean_new_tokens_per_round[i] * (2 - self.range_ratio)
+                    ),
+                    output_len=(
+                        self.mean_return_tokens_per_round[i] * (2 - self.range_ratio)
+                    ),
+                    num_prompts=config["num_clients"],
+                    range_ratio=self.range_ratio / (2 - self.range_ratio),
+                    tokenizer=self.tokenizer,
+                    dataset_path=self.dataset_path,
+                    random_sample=False,
+                )
+            )
+            for i in range(self.num_rounds)
+        ]
+
+        # Users waiting for their next round, kept sorted by ascending start.
+        self.multiturn_queue = []
+
+        # user_stats[k]: users created with k + 1 total rounds.
+        # input_stats[r] / output_stats[r]: [token sum, sample count] of round r.
+        self.user_stats = [0 for _ in range(self.num_rounds)]
+        self.input_stats = [[0, 0] for _ in range(self.num_rounds)]
+        self.output_stats = [[0, 0] for _ in range(self.num_rounds)]
+
+    def _sample_return_tokens(self, round_idx):
+        """Draw the output length for ``round_idx``; fall back to the mean if <= 0."""
+        mean = self.mean_return_tokens_per_round[round_idx]
+        tokens = int(random.gauss(mean, self.sigma))
+        return tokens if tokens > 0 else mean
+
+    def gen(self):
+        """Create a new first-round user.
+
+        Not decorated with ``synchronized`` itself: it is called from
+        ``pop()``, which already holds the (non-reentrant) lock.
+
+        Returns:
+            UserData for round 0 with ``start == 0``.
+        """
+        user_id = self.user_id
+        self.user_id += 1
+
+        # Draw from [0, total) so each bucket k has weight round_ratios[k]
+        # exactly; an inclusive upper bound would over-weight the last bucket.
+        rand_ratio = random.randrange(self.cumulative_ratios[-1])
+        total_rounds = len(self.cumulative_ratios)
+        for idx, cumulative_ratio in enumerate(self.cumulative_ratios):
+            if rand_ratio < cumulative_ratio:
+                total_rounds = idx + 1
+                break
+        current_round = 0
+
+        candidate_input = random.choice(self.candidate_inputs[current_round])
+        self.input_stats[0][0] += candidate_input.prompt_len
+        self.input_stats[0][1] += 1
+        # Prefix with the user id so that each user's prompt is distinct.
+        prompt = f"{user_id} " + candidate_input.prompt
+        return_tokens = self._sample_return_tokens(current_round)
+        start = 0
+
+        user_data = UserData(
+            user_id, current_round, total_rounds, prompt, return_tokens, start
+        )
+
+        self.user_stats[total_rounds - 1] += 1
+
+        return user_data
+
+    @synchronized()
+    def push(self, user_data, generated_text, len_itl):
+        """Record a finished round and queue the user's next round, if any.
+
+        Args:
+            user_data: the user whose round just completed (mutated in place).
+            generated_text: text returned by the server for that round.
+            len_itl: number of inter-token latencies recorded; the output
+                length is accounted as ``len_itl + 1``.
+        """
+        self.output_stats[user_data.current_round][0] += len_itl + 1
+        self.output_stats[user_data.current_round][1] += 1
+        user_data.current_round += 1
+        if user_data.current_round >= user_data.total_rounds:
+            return
+
+        round_idx = user_data.current_round
+        candidate_input = random.choice(self.candidate_inputs[round_idx])
+        self.input_stats[round_idx][0] += candidate_input.prompt_len
+        self.input_stats[round_idx][1] += 1
+        # The next prompt is the whole history plus a fresh question.
+        user_data.prompt += generated_text + candidate_input.prompt
+        user_data.return_tokens = self._sample_return_tokens(round_idx)
+
+        interval = random.gauss(self.mean_inter_round_interval[round_idx], self.sigma)
+        if interval <= 0:
+            interval = self.mean_inter_round_interval[round_idx]
+        user_data.start = time.perf_counter() + interval
+
+        # Insert before the first queued user that starts later, keeping the
+        # queue sorted; when no such user exists the position stays at
+        # len(queue), i.e. append. (The previous code inserted at the loop
+        # variable instead, placing late users before the last element.)
+        position = len(self.multiturn_queue)
+        for idx, queued in enumerate(self.multiturn_queue):
+            if user_data.start < queued.start:
+                position = idx
+                break
+        self.multiturn_queue.insert(position, user_data)
+
+    @synchronized()
+    def pop(self):
+        """Return the next user to send a request for.
+
+        A queued multi-turn user is returned once its ``start`` time has
+        passed; otherwise a new user is generated, so this never returns None.
+        """
+        if (
+            len(self.multiturn_queue)
+            and time.perf_counter() > self.multiturn_queue[0].start
+        ):
+            return self.multiturn_queue.pop(0)
+        return self.gen()
+
+
+def gen_payload(prompt, output_len):
+    """Build the JSON body for a streaming ``/generate`` request.
+
+    Args:
+        prompt: text placed under the ``"text"`` key.
+        output_len: value used for ``max_new_tokens``.
+
+    Returns:
+        dict: payload with greedy sampling (temperature 0.0), EOS ignored,
+        streaming enabled, an empty LoRA path and logprobs disabled.
+    """
+    sampling_params = dict(
+        temperature=0.0,
+        max_new_tokens=output_len,
+        ignore_eos=True,
+    )
+    return {
+        "text": prompt,
+        "sampling_params": sampling_params,
+        "stream": True,
+        "stream_options": {"include_usage": True},
+        "lora_path": "",
+        "return_logprob": False,
+        "logprob_start_len": -1,
+    }
+
+
+# Timeout object passed to every aiohttp.ClientSession created below;
+# total = 20 * 60 * 60 = 72000 (presumably seconds, i.e. 20 hours — see the
+# aiohttp ClientTimeout documentation).
+AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=20 * 60 * 60)
+
+
+async def async_request_sglang_generate(
+    user_data,
+    url,
+    atomic_counter,
+):
+    """
+    Sends a streaming request to the server. Gathers text token-by-token.
+
+    Args:
+        user_data: object providing ``prompt`` and ``return_tokens``; its
+            ``__dict__`` is also written to the debug log.
+        url: endpoint the payload is POSTed to.
+        atomic_counter: counter incremented by 1 once the request finishes,
+            whether it succeeded or failed.
+
+    Returns:
+        RequestFuncOutput with ``success``, and on success ``generated_text``,
+        ``ttft``, ``itl``, ``latency``, ``prompt_len`` and ``cached_tokens``;
+        on failure ``error`` is set instead. Exceptions are caught and
+        reported through the output rather than raised.
+    """
+    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+        headers = {}
+        generated_text = ""
+        ttft = 0.0
+        st = time.perf_counter()
+        most_recent_timestamp = st
+        # Initialised up front so a 200 response that yields no non-empty
+        # chunk does not raise UnboundLocalError when latency is stored.
+        latency = 0.0
+        output = RequestFuncOutput()
+        payload = gen_payload(user_data.prompt, user_data.return_tokens)
+        write_debug_log({"timestamp": st, "user_data": user_data.__dict__})
+
+        try:
+            async with session.post(url=url, json=payload, headers=headers) as response:
+                if response.status == 200:
+                    prompt_tokens = 0
+                    cached_tokens = 0
+                    async for chunk_bytes in response.content:
+                        chunk_bytes = chunk_bytes.strip()
+                        if not chunk_bytes:
+                            continue
+
+                        chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ")
+                        # Updated on every chunk, so it ends up as the time
+                        # of the last chunk received (including "[DONE]").
+                        latency = time.perf_counter() - st
+                        if chunk == "[DONE]":
+                            continue
+
+                        data = json.loads(chunk)
+                        if not data.get("text"):
+                            continue
+
+                        timestamp = time.perf_counter()
+                        # First token
+                        if ttft == 0.0:
+                            ttft = time.perf_counter() - st
+                            output.ttft = ttft
+                            meta_info = data.get("meta_info") or {}
+                            prompt_tokens = meta_info.get("prompt_tokens", 0)
+                            cached_tokens = meta_info.get("cached_tokens", 0)
+                        # Decoding phase
+                        else:
+                            output.itl.append(timestamp - most_recent_timestamp)
+
+                        most_recent_timestamp = timestamp
+                        # Replaced, not appended: each chunk's "text" is taken
+                        # as the full text so far (presumably cumulative —
+                        # verify against the server's streaming format).
+                        generated_text = data["text"]
+
+                    output.generated_text = generated_text
+                    output.success = True
+                    output.latency = latency
+                    output.prompt_len = prompt_tokens
+                    output.cached_tokens = cached_tokens
+                else:
+                    output.error = response.reason or ""
+                    output.success = False
+        except Exception as e:
+            # Deliberately broad: any transport/parse error marks this
+            # request as failed instead of aborting the benchmark.
+            output.success = False
+            output.error = str(e)
+            print(f"Request failed: {e}")
+
+    atomic_counter.increment(1)
+    return output
+
+
+class AtomicCounter:
+    """Integer counter whose reads and updates are guarded by a lock."""
+
+    def __init__(self, initial_value=0):
+        self.lock = threading.Lock()
+        self._value = initial_value
+
+    def increment(self, amount=1):
+        """Add ``amount`` to the counter while holding the lock."""
+        with self.lock:
+            self._value += amount
+
+    def get(self):
+        """Return the current value, read while holding the lock."""
+        with self.lock:
+            return self._value
+
+
+class WorkloadGenerator:
+    """Drives the benchmark: sends requests, collects responses, reports metrics.
+
+    A sender thread runs an asyncio loop that keeps up to ``max_parallel``
+    requests in flight for ``duration`` seconds; a handler thread consumes
+    finished responses, feeds them back into the ``UserGenerator`` and
+    records per-request metrics.
+    """
+
+    def __init__(self, args):
+        """Load the config (``load_config``) and build the user generator.
+
+        Args:
+            args: parsed CLI namespace providing ``model_path``,
+                ``dataset_path``, ``host``, ``port`` and ``duration``.
+        """
+        config = load_config()
+        user_generator = UserGenerator(
+            config,
+            args.model_path,
+            args.dataset_path,
+        )
+
+        self.url = f"http://{args.host}:{args.port}/generate"
+
+        self.tokenizer = user_generator.tokenizer
+        self.start_time = None
+        self.finished_time = None
+        self.duration = args.duration
+        self.done = False
+
+        # sent - completed is the number of requests currently in flight.
+        self.sent_requests = 0
+        self.completed_requests = 0
+
+        self.user_generator = user_generator
+        self.response_queue = queue.Queue()
+        self.performance_metrics = {
+            "ttft": [],
+            "latency": [],
+            "prompt_len": [],
+            "cached_tokens": [],
+        }
+        self.max_parallel = config["num_clients"]
+
+        self.atomic_counter = AtomicCounter()
+
+    @staticmethod
+    def _mean(values):
+        """Arithmetic mean of ``values``; 0.0 for an empty sequence."""
+        return sum(values) / len(values) if values else 0.0
+
+    @staticmethod
+    def _percentile(sorted_values, fraction):
+        """Element at index ``int(fraction * n)`` of a sorted list; 0.0 if empty."""
+        if not sorted_values:
+            return 0.0
+        index = min(int(fraction * len(sorted_values)), len(sorted_values) - 1)
+        return sorted_values[index]
+
+    async def handle_request(self, user_data):
+        """Send one request and queue ``(user_data, response)`` for the handler."""
+        try:
+            response = await async_request_sglang_generate(
+                user_data, self.url, self.atomic_counter
+            )
+            self.response_queue.put((user_data, response))
+        except Exception as e:
+            print(f"Request failed: {e}")
+            # Nothing was queued, so free the in-flight slot here.
+            self.completed_requests += 1
+
+    def request_sender(self):
+        """Thread target: issue requests until ``duration`` has elapsed."""
+
+        async def request_loop():
+            while True:
+                # Checked on every iteration (also while at the concurrency
+                # cap) so a stalled server cannot keep the sender alive past
+                # the requested duration.
+                if time.perf_counter() - self.start_time > self.duration:
+                    self.done = True
+                    break
+
+                if self.sent_requests - self.completed_requests >= self.max_parallel:
+                    # At the cap: yield so in-flight request tasks can run.
+                    await asyncio.sleep(0.05)
+                    continue
+
+                new_request = self.user_generator.pop()
+                if new_request:
+                    asyncio.create_task(self.handle_request(new_request))
+                    self.sent_requests += 1
+
+        loop = asyncio.new_event_loop()
+        asyncio.set_event_loop(loop)
+        loop.run_until_complete(request_loop())
+        # Requests still in flight at this point are not awaited.
+        loop.close()
+
+    def response_handler(self):
+        """Thread target: consume responses until the sender is done and the queue is idle."""
+        while True:
+            try:
+                user_data, response = self.response_queue.get(timeout=10)
+            except queue.Empty:
+                if self.done:
+                    break
+                continue
+
+            logger.info(
+                f"{((time.perf_counter()-self.start_time)/self.duration*100):.2f}%"
+            )
+            if not response.success:
+                print(
+                    f"Error processing response for client {user_data}: "
+                    f"Request failed with error: {response.error}"
+                )
+                # A failed request has still finished: release its slot,
+                # otherwise failures permanently shrink the concurrency.
+                self.completed_requests += 1
+                continue
+
+            try:
+                self.user_generator.push(
+                    user_data, response.generated_text, len(response.itl)
+                )
+            except ValueError as e:
+                print(f"Error processing response for client {user_data}: {e}")
+                self.completed_requests += 1
+                continue
+
+            self.performance_metrics["ttft"].append(response.ttft)
+            self.performance_metrics["latency"].append(response.latency)
+            self.performance_metrics["prompt_len"].append(response.prompt_len)
+            self.performance_metrics["cached_tokens"].append(response.cached_tokens)
+            self.completed_requests += 1
+            self.finished_time = time.perf_counter()
+
+    def run(self):
+        """Run the benchmark to completion and print a summary.
+
+        Returns:
+            dict: ``{"summary": {...}}`` with request count, TTFT/latency
+            average, P90 and median, throughput and cache hit rate. All
+            statistics are 0 when no request succeeded.
+        """
+        request_thread = threading.Thread(target=self.request_sender, daemon=True)
+        response_thread = threading.Thread(target=self.response_handler, daemon=True)
+
+        self.start_time = time.perf_counter()
+        request_thread.start()
+        response_thread.start()
+
+        request_thread.join()
+        response_thread.join()
+
+        ttfts = sorted(self.performance_metrics["ttft"])
+        latencies = sorted(self.performance_metrics["latency"])
+        prompt_total = sum(self.performance_metrics["prompt_len"])
+        cached_total = sum(self.performance_metrics["cached_tokens"])
+
+        # finished_time stays None when no request succeeded; fall back to
+        # "now" so the throughput computation cannot fail.
+        end_time = (
+            self.finished_time
+            if self.finished_time is not None
+            else time.perf_counter()
+        )
+        elapsed = end_time - self.start_time
+
+        performance_data = {
+            "summary": {
+                "total_requests": len(ttfts),
+                "average_ttft": self._mean(ttfts),
+                "p90_ttft": self._percentile(ttfts, 0.9),
+                "median_ttft": self._percentile(ttfts, 0.5),
+                "average_latency": self._mean(latencies),
+                "p90_latency": self._percentile(latencies, 0.9),
+                "median_latency": self._percentile(latencies, 0.5),
+                # Counts every finished request, including failed ones.
+                "throughput": (
+                    self.atomic_counter.get() / elapsed if elapsed > 0 else 0
+                ),
+                "cache_hit_rate": (
+                    0 if prompt_total == 0 else cached_total / prompt_total
+                ),
+            },
+        }
+        summary = performance_data["summary"]
+        print("All requests completed")
+        print("Performance metrics summary:")
+        print(f"  Total requests: {summary['total_requests']}")
+        print(f"  Average TTFT: {summary['average_ttft']:.2f}")
+        print(f"  P90 TTFT: {summary['p90_ttft']:.2f}")
+        print(f"  Median TTFT: {summary['median_ttft']:.2f}")
+        print(f"  Average latency: {summary['average_latency']:.2f}")
+        print(f"  P90 latency: {summary['p90_latency']:.2f}")
+        print(f"  Median latency: {summary['median_latency']:.2f}")
+        print(f"  Throughput: {summary['throughput']:.2f} requests per second")
+        print(f"  Cache Hit Rate: {summary['cache_hit_rate']:.6f}")
+
+        user_stats = self.user_generator.user_stats
+        input_stats = self.user_generator.input_stats
+        output_stats = self.user_generator.output_stats
+        print(f"round_ratios: {user_stats}")
+        print(
+            f"mean_new_tokens_per_round: {[int(a/b) if b > 0 else 0 for a, b in input_stats]}"
+        )
+        print(
+            f"mean_return_tokens_per_round: {[int(a/b) if b > 0 else 0 for a, b in output_stats]}"
+        )
+        return performance_data
+
+
+def main():
+    """Entry point: parse arguments, configure logging and run the benchmark.
+
+    With ``--log-level debug`` the JSONL debug log is opened (truncating any
+    existing file) and is closed again even if the benchmark raises.
+    """
+    global debug_log_file
+
+    args = parse_args()
+    if args.log_level == "debug":
+        logging.basicConfig(level=logging.DEBUG)
+        logger.info("use log_level debug")
+        # Initialize debug log file
+        debug_log_file = open(args.debug_log_file, "w")
+    else:
+        logging.basicConfig(level=logging.INFO)
+        logger.info("use log_level info")
+
+    try:
+        WorkloadGenerator(args).run()
+    finally:
+        # Close debug log file if it was opened, and reset the global so
+        # write_debug_log() never touches a closed handle.
+        if debug_log_file:
+            debug_log_file.close()
+            debug_log_file = None
+
+
+if __name__ == "__main__":
+    main()
--- a/benchmark/hicache/bench_mix.sh
+++ b/benchmark/hicache/bench_mix.sh
+#!/bin/bash
+
+# Launches an sglang server with the hierarchical cache enabled, writes a
+# workload config to CONFIG_PATH and then starts bench_mix.py against it.
+# Both processes run in the background under nohup.
+
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib/python3.12/dist-packages:/usr/local/lib/python3.12/dist-packages/torch/lib
+# Start the server in the background; its output goes to nohup.out, which is
+# removed first so the log only holds this run.
+# NOTE(review): nothing below waits for the server to become ready before the
+# benchmark is started — confirm whether a readiness wait is needed.
+rm -rf nohup.out && \
+nohup python3 -m sglang.launch_server \
+    --attention-backend triton \
+    --model-path /code/models/Qwen3-32B/ \
+    --log-level info \
+    --tp 4 --mem-frac 0.25 \
+    --host 0.0.0.0 --port 33301 \
+    --enable-metrics --enable-cache-report \
+    --page-size 64 \
+    --enable-hierarchical-cache \
+    --hicache-ratio 2.5 --hicache-size 0 \
+    --hicache-io-backend kernel \
+    --hicache-mem-layout layer_first \
+    --hicache-write-policy write_through \
+    &
+
+##################################################
+
+# bench_mix.py reads its workload configuration from the file named by CONFIG_PATH.
+export CONFIG_PATH=/tmp/bench_mix_config.json
+
+# num_rounds: Length that every per-round list below must have
+# num_clients: Maximum number of concurrent client requests to be simulated
+# round_ratios: Distribution of requests across rounds. Given sum(round_ratios) total requests,
+#               round_ratios[i] denotes the number of requests that will execute for (i+1) rounds
+# mean_new_tokens_per_round: Mean number of new prompt tokens added in each round
+# mean_return_tokens_per_round: Mean number of tokens requested from the server in each round
+# mean_inter_round_interval: Mean delay before a user sends its next round
+echo '{
+  "num_rounds": 10,
+  "num_clients": 60,
+  "round_ratios": [50, 25, 15, 15, 10, 10, 9, 8, 7, 6],
+  "mean_new_tokens_per_round": [1000, 400, 350, 300, 280, 260, 240, 220, 210, 200],
+  "mean_return_tokens_per_round": [100, 100, 100, 100, 100, 100, 100, 100, 100, 100],
+  "mean_inter_round_interval": [30, 30, 30, 30, 30, 30, 30, 30, 30, 30]
+}' > ${CONFIG_PATH}
+
+# Run the benchmark in the background against the server port used above;
+# stdout is captured in bench_mix.out.
+rm -rf bench_mix.out && \
+nohup python3 /sgl-workspace/sglang/benchmark/hicache/bench_mix.py \
+    --model-path /code/models/Qwen3-32B/ \
+    --dataset-path /code/models/ShareGPT_V3_unfiltered_cleaned_split.json \
+    --port 33301 \
+    --duration 600 \
+> bench_mix.out &
--- a/benchmark/hicache/bench_multiturn.py
+++ b/benchmark/hicache/bench_multiturn.py
+import argparse
+import asyncio
+import json
+import queue
+import random
+import threading
+import time
+from datetime import datetime
+from typing import Optional
+
+import aiohttp
+import numpy as np
+import requests
+from tqdm.asyncio import tqdm
+
+from sglang.bench_serving import (
+    RequestFuncOutput,
+    get_tokenizer,
+    remove_prefix,
+    sample_random_requests,
+)
+
+# Timeout object passed to every aiohttp.ClientSession created below;
+# total = 20 * 60 * 60 = 72000 (presumably seconds, i.e. 20 hours — see the
+# aiohttp ClientTimeout documentation).
+AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=20 * 60 * 60)
+
+
+def parse_args():
+    """Parse the command-line options of the multi-turn benchmark.
+
+    Returns:
+        argparse.Namespace with one attribute per option defined below.
+    """
+    parser = argparse.ArgumentParser(
+        description="Script to benchmark concurrent requests to a server."
+    )
+    parser.add_argument(
+        "--num-clients",
+        type=int,
+        default=256,
+        help="Number of concurrent clients",
+    )
+    parser.add_argument(
+        "--max-parallel",
+        type=int,
+        default=128,
+        help="Maximum number of parallel requests",
+    )
+    parser.add_argument(
+        "--request-length",
+        type=int,
+        default=512,
+        help="Length of each new request",
+    )
+    parser.add_argument(
+        "--output-length",
+        type=int,
+        default=64,
+        help="Length of each output",
+    )
+    parser.add_argument(
+        "--num-rounds",
+        type=int,
+        default=5,
+        help="Number of rounds per client",
+    )
+    parser.add_argument(
+        "--distribution",
+        type=str,
+        default="poisson",
+        choices=["poisson", "uniform"],
+        help="Distribution type for request intervals (poisson or uniform)",
+    )
+    parser.add_argument(
+        "--request-rate",
+        type=float,
+        default=1.0,
+        help="Average number of requests per second",
+    )
+    parser.add_argument(
+        "--host",
+        type=str,
+        default="localhost",
+        help="Server hostname or IP (default: localhost)",
+    )
+    parser.add_argument(
+        "--port",
+        type=int,
+        default=30000,
+        help="Server port (default: 30000)",
+    )
+    parser.add_argument(
+        "--model-path",
+        type=str,
+        default="meta-llama/Llama-3.1-8B-Instruct",
+        help="model path compatible with Hugging Face Transformers",
+    )
+    parser.add_argument(
+        "--dataset-path",
+        type=str,
+        default="",
+        help="local dataset to sample tokens from",
+    )
+    parser.add_argument(
+        "--log-file",
+        type=str,
+        default="performance_metrics.jsonl",
+        help="File to log performance metrics",
+    )
+    parser.add_argument(
+        "--disable-auto-run",
+        action="store_true",
+        help="If set, disable automatically testing with a range of request rates.",
+    )
+
+    parser.add_argument(
+        "--disable-random-sample",
+        action="store_true",
+        help="If set, disable random sampling of requests from the ShareGPT dataset.",
+    )
+    parser.add_argument(
+        "--sub-question-input-length",
+        type=int,
+        default=0,
+        help="Length of the sub question input for each request, if set 0 use request_length",
+    )
+    parser.add_argument(
+        "--ready-queue-policy",
+        type=str,
+        default="random",
+        # ReadyQueue.pop() only implements these two policies; rejecting
+        # anything else here fails fast instead of mid-benchmark.
+        choices=["random", "fifo"],
+        help="Policy for popping requests from the ready queue (random or fifo)",
+    )
+    parser.add_argument(
+        "--tag",
+        type=str,
+        default="",
+        help="Tag of a certain run in the log file",
+    )
+    parser.add_argument("--seed", type=int, default=1, help="The random seed.")
+    parser.add_argument(
+        "--lora-path",
+        type=str,
+        default="",
+        help="String of LoRA path. Currently we only support benchmarking on a single LoRA adaptor.",
+    )
+    return parser.parse_args()
+
+
+async def async_request_sglang_generate(
+    payload,
+    url,
+    pbar: Optional[tqdm] = None,
+):
+    """
+    Sends a streaming request to the server. Gathers text token-by-token.
+
+    Args:
+        payload: JSON-serializable request body.
+        url: endpoint the payload is POSTed to.
+        pbar: optional progress bar, advanced by 1 once the request finishes
+            (whether it succeeded or failed).
+
+    Returns:
+        RequestFuncOutput with ``success``, and on success ``generated_text``,
+        ``ttft``, ``itl``, ``latency``, ``prompt_len``, ``cached_tokens`` and
+        ``generated_len``; on failure ``error`` is set instead. Exceptions
+        are caught and reported through the output rather than raised.
+    """
+    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+        headers = {}
+        generated_text = ""
+        ttft = 0.0
+        st = time.perf_counter()
+        most_recent_timestamp = st
+        # Initialised up front so a 200 response that yields no non-empty
+        # chunk does not raise UnboundLocalError when latency is stored.
+        latency = 0.0
+        output = RequestFuncOutput()
+
+        try:
+            async with session.post(url=url, json=payload, headers=headers) as response:
+                if response.status == 200:
+                    prompt_tokens = 0
+                    cached_tokens = 0
+                    async for chunk_bytes in response.content:
+                        chunk_bytes = chunk_bytes.strip()
+                        if not chunk_bytes:
+                            continue
+
+                        chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ")
+                        # Updated on every chunk, so it ends up as the time
+                        # of the last chunk received (including "[DONE]").
+                        latency = time.perf_counter() - st
+                        if chunk == "[DONE]":
+                            continue
+
+                        data = json.loads(chunk)
+                        # .get() so a chunk without a "text" key is skipped
+                        # instead of failing the request with a KeyError.
+                        if not data.get("text"):
+                            continue
+
+                        timestamp = time.perf_counter()
+                        # First token
+                        if ttft == 0.0:
+                            ttft = time.perf_counter() - st
+                            output.ttft = ttft
+                            meta_info = data.get("meta_info") or {}
+                            prompt_tokens = meta_info.get("prompt_tokens", 0)
+                            cached_tokens = meta_info.get("cached_tokens", 0)
+                        # Decoding phase
+                        else:
+                            output.itl.append(timestamp - most_recent_timestamp)
+
+                        most_recent_timestamp = timestamp
+                        # Replaced, not appended: each chunk's "text" is taken
+                        # as the full text so far (presumably cumulative —
+                        # verify against the server's streaming format).
+                        generated_text = data["text"]
+
+                    output.generated_text = generated_text
+                    output.success = True
+                    output.latency = latency
+                    output.prompt_len = prompt_tokens
+                    output.cached_tokens = cached_tokens
+                    # One ITL entry per text chunk after the first.
+                    output.generated_len = len(output.itl) + 1
+                else:
+                    output.error = response.reason or ""
+                    output.success = False
+        except Exception as e:
+            # Deliberately broad: any transport/parse error marks this
+            # request as failed instead of aborting the benchmark.
+            output.success = False
+            output.error = str(e)
+            print(f"Request failed: {e}")
+
+    if pbar:
+        pbar.update(1)
+    return output
+
+
+def gen_payload(prompt, output_len, lora_path=""):
+    """Build the JSON body for a streaming ``/generate`` request.
+
+    Args:
+        prompt: text placed under the ``"text"`` key.
+        output_len: value used for ``max_new_tokens``.
+        lora_path: value placed under ``"lora_path"`` (empty by default).
+
+    Returns:
+        dict: payload with greedy sampling (temperature 0.0), EOS ignored,
+        streaming enabled and logprobs disabled.
+    """
+    sampling_params = dict(
+        temperature=0.0,
+        max_new_tokens=output_len,
+        ignore_eos=True,
+    )
+    return {
+        "text": prompt,
+        "sampling_params": sampling_params,
+        "stream": True,
+        "stream_options": {"include_usage": True},
+        "lora_path": lora_path,
+        "return_logprob": False,
+        "logprob_start_len": -1,
+    }
+
+
+def log_to_jsonl_file(data, file_path="performance_metrics.jsonl", tag=""):
+    """Append ``data`` plus a timestamp and tag as one line of a JSONL file.
+
+    Args:
+        data: mapping merged into the record after ``timestamp`` and ``tag``
+            (its keys win on collision).
+        file_path: file opened in append mode.
+        tag: label stored under the ``"tag"`` key.
+
+    I/O errors are reported on stdout and otherwise ignored.
+    """
+    record = {"timestamp": datetime.now().isoformat(), "tag": tag}
+    record.update(data)
+    try:
+        with open(file_path, "a") as handle:
+            # One JSON document per line (JSONL).
+            handle.write(f"{json.dumps(record)}\n")
+    except OSError as err:  # IOError is an alias of OSError
+        print(f"Error writing to JSONL file: {err}")
+
+
+class ReadyQueue:
+    """
+    Lock-protected list of pending requests with a configurable pop order.
+
+    ``policy`` selects which element ``pop()`` removes: ``"random"`` picks a
+    uniformly random index, ``"fifo"`` takes the oldest element.
+    """
+
+    def __init__(self, init_requests=None, policy="random"):
+        self.lock = threading.Lock()
+        # A non-empty init_requests list is used as-is (not copied).
+        self.requests = init_requests if init_requests else []
+        self.policy = policy
+
+    def append(self, item):
+        """Add ``item`` to the end of the queue."""
+        with self.lock:
+            self.requests.append(item)
+
+    def pop(self):
+        """Remove and return one request according to the policy.
+
+        Returns None when the queue is empty (checked before the policy).
+
+        Raises:
+            ValueError: if the policy is neither "random" nor "fifo".
+        """
+        with self.lock:
+            pending = self.requests
+            if len(pending) == 0:
+                return None
+            if self.policy == "fifo":
+                return pending.pop(0)
+            if self.policy == "random":
+                return pending.pop(random.randrange(len(pending)))
+            # todo, varying thinking time of clients
+            raise ValueError(f"{self.policy} not implemented")
+
+
+class WorkloadGenerator:
+    def __init__(self, args):
+        # Construct the base URL for requests
+        self.url = f"http://{args.host}:{args.port}/generate"
+
+        self.tokenizer = get_tokenizer(args.model_path)
+        self.distribution = args.distribution
+        self.request_rate = args.request_rate
+        self.start_time = None
+        self.finished_time = None
+
+        self.sent_requests = 0
+        self.completed_requests = 0
+
+        self.candidate_inputs = sample_random_requests(
+            input_len=args.request_length,
+            output_len=args.output_length,
+            num_prompts=args.num_clients,
+            range_ratio=1.0,
+            tokenizer=self.tokenizer,
+            dataset_path=args.dataset_path,
+            random_sample=not args.disable_random_sample,
+        )
+        self.candidate_inputs = [i.prompt for i in self.candidate_inputs]
+
+        if args.sub_question_input_length != 0:
+            sub_question_input_length = args.sub_question_input_length
+        else:
+            sub_question_input_length = args.request_length
+
+        self.sub_question_inputs = sample_random_requests(
+            input_len=sub_question_input_length,
+            output_len=args.output_length,
+            num_prompts=args.num_clients * max(args.num_rounds - 1, 1),
+            range_ratio=1.0,
+            tokenizer=self.tokenizer,
+            dataset_path=args.dataset_path,
+            random_sample=not args.disable_random_sample,
+        )
+
+        init_requests = [
+            (
+                i,
+                gen_payload(
+                    self.candidate_inputs[i], args.output_length, args.lora_path
+                ),
+            )
+            for i in range(args.num_clients)
+        ]
+        self.client_records = {
+            i: {"round": 0, "history": init_requests[i][1]["text"]}
+            for i in range(args.num_clients)
+        }
+        self.ready_queue = ReadyQueue(
+            init_requests=init_requests, policy=args.ready_queue_policy
+        )
+        self.candidate_inputs = self.candidate_inputs[args.num_clients :]
+
+        self.response_queue = queue.Queue()
+        self.pbar = tqdm(total=args.num_clients * args.num_rounds)
+        self.performance_metrics = {
+            "ttft": [],
+            "latency": [],
+            "prompt_len": [],
+            "cached_tokens": [],
+            "generated_len": [],
+        }
+        self.num_rounds = args.num_rounds
+        self.max_parallel = args.max_parallel
+        self.output_length = args.output_length
+
+    async def handle_request(self, item):
+        try:
+            client_id, payload = item
+            response = await async_request_sglang_generate(payload, self.url, self.pbar)
+            if self.pbar.n == self.pbar.total:
+                self.finished_time = time.perf_counter()
+            self.response_queue.put((client_id, response))
+        except Exception as e:
+            print(f"Request failed: {e}")
+
+    def request_sender(self):
+        async def request_loop():
+            while True:
+                if self.sent_requests - self.completed_requests < self.max_parallel:
+                    new_request = self.ready_queue.pop()
+                    if new_request:
+                        asyncio.create_task(self.handle_request(new_request))
+                        self.sent_requests += 1
+                else:
+                    await asyncio.sleep(0.05)
+                    continue
+
+                if self.pbar.n == self.pbar.total:
+                    break
+
+                # Calculate Poisson-distributed wait time
+                if self.distribution == "poisson":
+                    sleep_time = random.expovariate(self.request_rate)
+                elif self.distribution == "uniform":
+                    avg_interval = (
+                        1.0 / self.request_rate if self.request_rate > 0 else 1.0
+                    )
+                    sleep_time = random.uniform(0, 2 * avg_interval)
+                else:
+                    raise ValueError("Invalid distribution type")
+                await asyncio.sleep(sleep_time)  # Wait before sending the next request
+
+        # Create and run the event loop for asynchronous requests
+        loop = asyncio.new_event_loop()
+        asyncio.set_event_loop(loop)
+        loop.run_until_complete(request_loop())
+        loop.close()
+
+    def response_handler(self):
+        while True:
+            try:
+                client_id, response = self.response_queue.get(
+                    timeout=10
+                )  # Block until response is available
+                if not response.success:
+                    raise ValueError(f"Request failed with error: {response.error}")
+                self.client_records[client_id]["history"] += response.generated_text
+                self.client_records[client_id]["round"] += 1
+                self.performance_metrics["ttft"].append(response.ttft)
+                self.performance_metrics["latency"].append(response.latency)
+                self.performance_metrics["prompt_len"].append(response.prompt_len)
+                self.performance_metrics["cached_tokens"].append(response.cached_tokens)
+                self.performance_metrics["generated_len"].append(response.generated_len)
+                self.completed_requests += 1
+
+                if self.client_records[client_id]["round"] < self.num_rounds:
+                    # append new request to client's history
+                    self.client_records[client_id][
+                        "history"
+                    ] += self.sub_question_inputs.pop().prompt
+                    self.ready_queue.append(
+                        (
+                            client_id,
+                            gen_payload(
+                                self.client_records[client_id]["history"],
+                                self.output_length,
+                                args.lora_path,
+                            ),
+                        )
+                    )
+            except queue.Empty:
+                if self.pbar.n == self.pbar.total:
+                    break
+            except ValueError as e:
+                print(f"Error processing response for client {client_id}: {e}")
+                continue
+
+    def run(self):
+        request_thread = threading.Thread(target=self.request_sender, daemon=True)
+        response_thread = threading.Thread(target=self.response_handler, daemon=True)
+
+        self.start_time = time.perf_counter()
+        request_thread.start()
+        response_thread.start()
+
+        request_thread.join()
+        response_thread.join()
+        self.pbar.close()
+
+        duration = self.finished_time - self.start_time
+        performance_data = {
+            "summary": {
+                "total_requests": len(self.performance_metrics["ttft"]),
+                "request_rate": self.request_rate,
+                "average_ttft": sum(self.performance_metrics["ttft"])
+                / len(self.performance_metrics["ttft"]),
+                "p90_ttft": sorted(self.performance_metrics["ttft"])[
+                    int(0.9 * len(self.performance_metrics["ttft"]))
+                ],
+                "median_ttft": sorted(self.performance_metrics["ttft"])[
+                    len(self.performance_metrics["ttft"]) // 2
+                ],
+                "average_latency": sum(self.performance_metrics["latency"])
+                / len(self.performance_metrics["latency"]),
+                "p90_latency": sorted(self.performance_metrics["latency"])[
+                    int(0.9 * len(self.performance_metrics["latency"]))
+                ],
+                "median_latency": sorted(self.performance_metrics["latency"])[
+                    len(self.performance_metrics["latency"]) // 2
+                ],
+                "input_token_throughput": sum(self.performance_metrics["prompt_len"])
+                / duration,
+                "output_token_throughput": sum(
+                    self.performance_metrics["generated_len"]
+                )
+                / duration,
+                "throughput": self.pbar.total / duration,
+                "cache_hit_rate": (
+                    0
+                    if sum(self.performance_metrics["prompt_len"]) == 0
+                    else sum(self.performance_metrics["cached_tokens"])
+                    / sum(self.performance_metrics["prompt_len"])
+                ),
+            },
+        }
+        print("All requests completed")
+        print("Performance metrics summary:")
+        print(
+            f"  Total requests: {performance_data['summary']['total_requests']} at {performance_data['summary']['request_rate']} requests per second"
+        )
+        print(f"  Average TTFT: {performance_data['summary']['average_ttft']:.2f}")
+        print(f"  P90 TTFT: {performance_data['summary']['p90_ttft']:.2f}")
+        print(f"  Median TTFT: {performance_data['summary']['median_ttft']:.2f}")
+        print(
+            f"  Average latency: {performance_data['summary']['average_latency']:.2f}"
+        )
+        print(f"  P90 latency: {performance_data['summary']['p90_latency']:.2f}")
+        print(f"  Median latency: {performance_data['summary']['median_latency']:.2f}")
+        print(
+            f"  Input token throughput: {performance_data['summary']['input_token_throughput']:.2f} tokens per second"
+        )
+        print(
+            f"  Output token throughput: {performance_data['summary']['output_token_throughput']:.2f} tokens per second"
+        )
+        print(
+            f"  Request Throughput: {performance_data['summary']['throughput']:.2f} requests per second"
+        )
+        print(f"  Cache Hit Rate: {performance_data['summary']['cache_hit_rate']:.6f}")
+        return performance_data
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    flush_cache_url = f"http://{args.host}:{args.port}/flush_cache"
+
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+
+    if args.disable_auto_run:
+        print("Running with specified request rate...")
+        request_rates = [args.request_rate]
+    else:
+        print("Auto-running with different request rates...")
+        request_rates = [16, 14, 12, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
+
+    for rate in request_rates:
+        args.request_rate = rate
+        requests.post(flush_cache_url)
+        time.sleep(1)
+        performance_data = WorkloadGenerator(args).run()
+        log_to_jsonl_file(performance_data, args.log_file, tag=args.tag)
--- a/benchmark/hicache/bench_serving.py
+++ b/benchmark/hicache/bench_serving.py
+# Adapted from https://github.com/vllm-project/vllm/blob/6366efc67b0aedd2c1721c14385370e50b297fb3/benchmarks/backend_request_func.py
+# Adapted from https://github.com/vllm-project/vllm/blob/6366efc67b0aedd2c1721c14385370e50b297fb3/benchmarks/benchmark_serving.py
+
+"""
+Benchmark online serving with dynamic requests.
+
+Usage:
+python3 -m sglang.bench_serving --backend sglang --num-prompt 10
+
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 3000 --random-input 1024 --random-output 1024 --random-range-ratio 0.5
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --request-rate-range 1,2,4,8,16,32 --random-input 4096 --random-output 1024 --random-range-ratio 0.125 --multi
+"""
+
+import argparse
+import asyncio
+import json
+import os
+import random
+import sys
+import time
+import traceback
+import warnings
+from argparse import ArgumentParser
+from dataclasses import dataclass, field
+from datetime import datetime
+from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple
+
+import aiohttp
+import numpy as np
+import requests
+from data_processing import MsgContent, SampleOutput, get_dataset
+from tqdm.asyncio import tqdm
+from transformers import PreTrainedTokenizerBase
+
+from sglang.bench_serving import get_tokenizer, remove_prefix, set_ulimit
+
+# Shared client timeout for every aiohttp session opened in this module;
+# the total budget is 20 * 60 * 60 (i.e. 20 hours when read as seconds).
+AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=20 * 60 * 60)
+
+# NOTE: a `global` statement at module scope has no effect. The request
+# helpers below read a module-level `args`; presumably it is bound by the
+# script's entry point before they run -- verify against the caller.
+global args
+
+
+@dataclass
+class RequestFuncInput:
+    """Everything a request helper needs to send one conversation's turns."""
+
+    # One tuple per turn; the request helper unpacks each as
+    # (prompt, input_len, max_tokens) and later overwrites the third element
+    # with the number of tokens actually generated for that turn.
+    prompts: List[Tuple[MsgContent, int, int]]
+    # Endpoint the request is POSTed to.
+    api_url: str
+    # Value sent as the "model" field of the payload.
+    model: str
+    lora_name: str
+    # Extra key/value pairs merged into the request payload.
+    extra_request_body: Dict[str, Any]
+
+    # For multiturn chat, store the context
+    prev_messages: List = field(default_factory=list)
+    # Index into `prompts` of the next turn to send.
+    finished_prompts: int = 0
+
+
+@dataclass
+class RequestFuncOutput:
+    """Results collected by a request helper for the turn(s) it sent."""
+
+    # The request helper appends one entry to each of these lists per
+    # successfully completed turn.
+    generated_text: List[str] = field(default_factory=list)
+    prompt_len: List[int] = field(default_factory=list)
+    output_len: List[int] = field(default_factory=list)
+    # Wall-clock values measured with time.perf_counter(), in seconds.
+    latency: List[float] = field(default_factory=list)
+    ttft: List[float] = field(default_factory=list)
+    itl: List[float] = field(default_factory=list)  # List of inter-token latencies
+
+    # Set by the request helper; `error` holds the HTTP reason or a formatted
+    # traceback when `success` is False.
+    success: bool = False
+    error: str = ""
+
+
+# set ignore_eos True by default
+async def async_request_openai_completions(
+    request_func_input: RequestFuncInput,
+    queue: asyncio.Queue,
+    tokenizer: PreTrainedTokenizerBase,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    api_url = request_func_input.api_url
+    assert api_url.endswith(
+        "completions"
+    ), "OpenAI Completions API URL must end with 'completions'."
+
+    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+        payload = {
+            "model": request_func_input.model,
+            "temperature": 0.0,
+            "best_of": 1,
+            "stream": not args.disable_stream,
+            "stream_options": {"include_usage": True},
+            "ignore_eos": not args.disable_ignore_eos,
+            **request_func_input.extra_request_body,
+        }
+        headers = {
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
+        }
+
+        output = RequestFuncOutput()
+
+        prompt_idx = request_func_input.finished_prompts
+        messages = request_func_input.prev_messages
+        prompt, input_len, max_tokens = request_func_input.prompts[prompt_idx]
+        prompt_len = sum(
+            prompt[1] + prompt[2]  # input_len + output_len
+            for prompt in request_func_input.prompts[:prompt_idx]
+        )
+        prompt_len += input_len
+
+        # Messages
+        messages.append(
+            {
+                "role": "user",
+                "content": prompt,
+            }
+        )
+        payload["messages"] = messages
+        payload["max_tokens"] = max_tokens
+
+        # output.prompt_len = request_func_input.prompt_len
+        # print(payload)
+
+        generated_text = ""
+        ttft = 0.0
+        st = time.perf_counter()
+        most_recent_timestamp = st
+        try:
+            async with session.post(
+                url=api_url, json=payload, headers=headers
+            ) as response:
+                if response.status == 200:
+                    actual_prompt_len = prompt_len - 1
+                    actual_output_len = 0
+                    async for chunk_bytes in response.content:
+                        chunk_bytes = chunk_bytes.strip()
+                        if not chunk_bytes:
+                            continue
+
+                        chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ")
+                        latency = time.perf_counter() - st
+                        if chunk == "[DONE]":
+                            pass
+                        else:
+                            data = json.loads(chunk)
+                            timestamp = time.perf_counter()
+                            # NOTE: Some completion API might have a last
+                            # usage summary response without a token so we
+                            # want to check a token was generated
+                            if data["usage"] is not None and len(data["usage"]) > 0:
+                                actual_prompt_len = data["usage"]["prompt_tokens"]
+                                actual_output_len = data["usage"]["completion_tokens"]
+                                continue
+                            delta = data["choices"][0]["delta"]
+
+                            if delta.get("content", None):
+                                # First token
+                                if ttft == 0.0:
+                                    ttft = time.perf_counter() - st
+                                    output.ttft.append(ttft)
+
+                                # Decoding phase
+                                else:
+                                    output.itl.append(timestamp - most_recent_timestamp)
+
+                                generated_text += delta["content"]
+                            most_recent_timestamp = timestamp
+
+                    output.prompt_len.append(actual_prompt_len)  # truncate <s>
+                    output.output_len.append(actual_output_len)
+                    output.generated_text.append(generated_text)
+                    output.success = True
+                    output.latency.append(latency)
+
+                    # Prepare for the new request
+                    request_func_input.prompts[prompt_idx] = (
+                        prompt,
+                        input_len,
+                        actual_output_len,  # changes from max_tokens to output_len
+                    )
+                    prompt_idx += 1
+                    messages.append(
+                        {
+                            "role": "assistant",
+                            "content": generated_text,
+                        }
+                    )
+
+                    # Move the new request to the end of the queue
+                    if prompt_idx < len(request_func_input.prompts):
+                        request_func_input.finished_prompts = prompt_idx
+                        request_func_input.prev_messages = messages
+                        await queue.put(request_func_input)
+                else:
+                    output.error = response.reason or ""
+                    output.success = False
+        except Exception:
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = "".join(traceback.format_exception(*exc_info))
+
+    if pbar:
+        pbar.update(1)
+    return output
+
+
+async def async_request_profile(api_url: str) -> RequestFuncOutput:
+    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+        output = RequestFuncOutput()
+        try:
+            async with session.post(url=api_url) as response:
+                if response.status == 200:
+                    output.success = True
+                else:
+                    output.error = response.reason or ""
+                    output.success = False
+        except Exception:
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = "".join(traceback.format_exception(*exc_info))
+
+    return output
+
+
+ASYNC_REQUEST_FUNCS = {
+    "sglang": async_request_openai_completions,
+    "vllm": async_request_openai_completions,
+    "lmdeploy": async_request_openai_completions,
+}
+
+
+@dataclass
+class BenchmarkMetrics:
+    """Aggregate results of one benchmark run, as built by calculate_metrics."""
+
+    # Number of successfully completed turns.
+    completed: int
+    # Token totals; the "retokenized" variant re-encodes the generated text
+    # with the benchmark's tokenizer instead of using reported lengths.
+    total_input: int
+    total_output: int
+    total_output_retokenized: int
+    # Throughputs: the corresponding totals divided by the run duration.
+    request_throughput: float
+    input_throughput: float
+    output_throughput: float
+    output_throughput_retokenized: float
+    total_throughput: float
+    total_throughput_retokenized: float
+    # Time to first token, in milliseconds.
+    mean_ttft_ms: float
+    median_ttft_ms: float
+    std_ttft_ms: float
+    p90_ttft_ms: float
+    p99_ttft_ms: float
+    # Time per output token excluding the first token, in milliseconds.
+    mean_tpot_ms: float
+    median_tpot_ms: float
+    std_tpot_ms: float
+    p90_tpot_ms: float
+    p99_tpot_ms: float
+    # Inter-token latency, in milliseconds.
+    mean_itl_ms: float
+    median_itl_ms: float
+    std_itl_ms: float
+    p90_itl_ms: float
+    p99_itl_ms: float
+    # End-to-end latency per turn, in milliseconds.
+    mean_e2e_latency_ms: float
+    median_e2e_latency_ms: float
+    std_e2e_latency_ms: float
+    p99_e2e_latency_ms: float
+    # Sum of end-to-end latencies divided by the run duration.
+    concurrency: float
+
+
+async def get_requests(
+    input_requests_queue: asyncio.Queue,
+    request_rate: float,
+    num_actual_requests: int,
+) -> AsyncGenerator[RequestFuncInput, None]:
+    for _ in range(num_actual_requests):
+        try:
+            request = await asyncio.wait_for(
+                input_requests_queue.get(), timeout=300
+            )  # Wait for 5 minutes then abort
+        except Exception as e:
+            print(f"exception: {e}")
+            break
+
+        yield request
+
+        if request_rate == float("inf"):
+            continue
+
+        interval = np.random.exponential(1.0 / request_rate)
+        await asyncio.sleep(interval)
+
+
+def calculate_metrics(
+    outputs: List[RequestFuncOutput],
+    dur_s: float,
+    tokenizer: PreTrainedTokenizerBase,
+    backend: str,
+) -> Tuple[BenchmarkMetrics, List[int]]:
+    output_lens: List[int] = []
+    retokenized_output_lens: List[int] = []
+    total_input = 0
+    completed = 0
+    itls: List[float] = []
+    tpots: List[float] = []
+    ttfts: List[float] = []
+    e2e_latencies: List[float] = []
+    output_success = 0
+    for i in range(len(outputs)):
+        if outputs[i].success:
+            output_success += 1
+            assert len(outputs[i].generated_text) == len(outputs[i].latency)
+            assert len(outputs[i].generated_text) == len(outputs[i].ttft)
+            for j in range(len(outputs[i].generated_text)):
+                output_len = outputs[i].output_len[j]
+                output_lens.append(output_len)
+                retokenized_output_len = len(
+                    tokenizer.encode(
+                        outputs[i].generated_text[j], add_special_tokens=False
+                    )
+                )
+                retokenized_output_lens.append(retokenized_output_len)
+                total_input += outputs[i].prompt_len[j]
+                if output_len > 1:
+                    tpots.append(
+                        (outputs[i].latency[j] - outputs[i].ttft[j]) / (output_len - 1)
+                    )
+
+                completed += 1
+            itls += outputs[i].itl
+            ttfts += outputs[i].ttft
+            e2e_latencies += outputs[i].latency
+
+        else:
+            output_lens.append(0)
+            retokenized_output_lens.append(0)
+
+    if completed == 0:
+        warnings.warn(
+            "All requests failed. This is likely due to a misconfiguration "
+            "on the benchmark arguments.",
+            stacklevel=2,
+        )
+    metrics = BenchmarkMetrics(
+        completed=completed,
+        total_input=total_input,
+        total_output=sum(output_lens),
+        total_output_retokenized=sum(retokenized_output_lens),
+        request_throughput=completed / dur_s,
+        input_throughput=total_input / dur_s,
+        output_throughput=sum(output_lens) / dur_s,
+        output_throughput_retokenized=sum(retokenized_output_lens) / dur_s,
+        total_throughput=(total_input + sum(output_lens)) / dur_s,
+        total_throughput_retokenized=(total_input + sum(retokenized_output_lens))
+        / dur_s,
+        mean_ttft_ms=np.mean(ttfts or 0)
+        * 1000,  # ttfts is empty if streaming is not supported by backend
+        median_ttft_ms=np.median(ttfts or 0) * 1000,
+        std_ttft_ms=np.std(ttfts or 0) * 1000,
+        p90_ttft_ms=np.percentile(ttfts or 0, 90) * 1000,
+        p99_ttft_ms=np.percentile(ttfts or 0, 99) * 1000,
+        mean_tpot_ms=np.mean(tpots or 0) * 1000,
+        median_tpot_ms=np.median(tpots or 0) * 1000,
+        std_tpot_ms=np.std(tpots or 0) * 1000,
+        p90_tpot_ms=np.percentile(tpots or 0, 90) * 1000,
+        p99_tpot_ms=np.percentile(tpots or 0, 99) * 1000,
+        mean_itl_ms=np.mean(itls or 0) * 1000,
+        median_itl_ms=np.median(itls or 0) * 1000,
+        std_itl_ms=np.std(itls or 0) * 1000,
+        p90_itl_ms=np.percentile(itls or 0, 90) * 1000,
+        p99_itl_ms=np.percentile(itls or 0, 99) * 1000,
+        mean_e2e_latency_ms=np.mean(e2e_latencies) * 1000,
+        median_e2e_latency_ms=np.median(e2e_latencies) * 1000,
+        std_e2e_latency_ms=np.std(e2e_latencies) * 1000,
+        p99_e2e_latency_ms=np.percentile(e2e_latencies, 99) * 1000,
+        concurrency=np.sum(e2e_latencies) / dur_s,
+    )
+
+    return metrics, output_lens
+
+
+async def benchmark(
+    backend: str,
+    api_url: str,
+    base_url: str,
+    model_id: str,
+    tokenizer: PreTrainedTokenizerBase,
+    input_requests: SampleOutput,
+    request_rate: float,
+    max_concurrency: Optional[int],
+    disable_tqdm: bool,
+    lora_name: str,
+    extra_request_body: Dict[str, Any],
+    profile: bool,
+    enable_shared_prefix: bool,
+):
+    if backend in ASYNC_REQUEST_FUNCS:
+        request_func = ASYNC_REQUEST_FUNCS[backend]
+    else:
+        raise ValueError(f"Unknown backend: {backend}")
+
+    # Limit concurrency
+    # From https://github.com/vllm-project/vllm/pull/9390
+    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
+
+    async def limited_request_func(request_func_input, queue, tokenizer, pbar):
+        if semaphore is None:
+            return await request_func(
+                request_func_input=request_func_input,
+                queue=queue,
+                tokenizer=tokenizer,
+                pbar=pbar,
+            )
+        async with semaphore:
+            return await request_func(
+                request_func_input=request_func_input,
+                queue=queue,
+                tokenizer=tokenizer,
+                pbar=pbar,
+            )
+
+    num_actual_requests = sum(len(r) for r in input_requests)
+    print(f"Num of shared prefixes or conversations: {len(input_requests)}")
+    print(f"Num of total requests: {num_actual_requests}")
+
+    # flatten the requests for shared prefix
+    if enable_shared_prefix:
+        input_requests = [[r] for requests in input_requests for r in requests]
+    inputs_requests_queue = asyncio.Queue(maxsize=len(input_requests))
+    print("Starting initial single prompt test run...")
+    # NOTE: Just use the first request of the first conversation for warmup
+    test_input = RequestFuncInput(
+        model=model_id,
+        prompts=input_requests[0][:1],
+        api_url=api_url,
+        lora_name=lora_name,
+        extra_request_body=extra_request_body,
+    )
+    test_output = await request_func(
+        request_func_input=test_input, queue=inputs_requests_queue, tokenizer=tokenizer
+    )
+    if not test_output.success:
+        raise ValueError(
+            "Initial test run failed - Please make sure benchmark arguments "
+            f"are correctly specified. Error: {test_output.error}"
+        )
+    else:
+        print("Initial test run completed. Starting main benchmark run...")
+
+    # Check the states
+    assert inputs_requests_queue.empty()
+
+    # Flush cache
+    if "sglang" in backend:
+        requests.post(base_url + "/flush_cache")
+
+    time.sleep(1.0)
+
+    # Start profiler
+    if profile:
+        print("Starting profiler...")
+        profile_output = await async_request_profile(
+            api_url=base_url + "/start_profile"
+        )
+        if profile_output.success:
+            print("Profiler started")
+
+    for request in input_requests:
+        request_func_input = RequestFuncInput(
+            model=model_id,
+            prompts=request,
+            api_url=api_url,
+            lora_name=lora_name,
+            extra_request_body=extra_request_body,
+        )
+        inputs_requests_queue.put_nowait(request_func_input)
+    if (
+        not args.enable_multiturn
+        and not args.enable_shared_prefix
+        and not args.dataset_name == "generated-shared-prefix"
+    ):
+        assert len(input_requests) == num_actual_requests
+
+    pbar = None if disable_tqdm else tqdm(total=num_actual_requests)
+
+    benchmark_start_time = time.perf_counter()
+    tasks: List[asyncio.Task] = []
+    async for request in get_requests(
+        inputs_requests_queue, request_rate, num_actual_requests
+    ):
+        tasks.append(
+            asyncio.create_task(
+                limited_request_func(
+                    request_func_input=request,
+                    queue=inputs_requests_queue,
+                    tokenizer=tokenizer,
+                    pbar=pbar,
+                )
+            )
+        )
+    outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
+
+    # Stop profiler
+    if profile:
+        print("Stopping profiler...")
+        profile_output = await async_request_profile(api_url=base_url + "/stop_profile")
+        if profile_output.success:
+            print("Profiler stopped")
+
+    if pbar is not None:
+        pbar.close()
+
+    # Compute metrics and print results
+    benchmark_duration = time.perf_counter() - benchmark_start_time
+    metrics, output_lens = calculate_metrics(
+        outputs=outputs,
+        dur_s=benchmark_duration,
+        tokenizer=tokenizer,
+        backend=backend,
+    )
+
+    print("\n{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
+    print("{:<40} {:<10}".format("Backend:", backend))
+    print("{:<40} {:<10}".format("Traffic request rate:", request_rate))
+    print(
+        "{:<40} {:<10}".format(
+            "Max request concurrency:",
+            max_concurrency if max_concurrency else "not set",
+        )
+    )
+    print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
+    print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
+    print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
+    print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output))
+    print(
+        "{:<40} {:<10}".format(
+            "Total generated tokens (retokenized):", metrics.total_output_retokenized
+        )
+    )
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Request throughput (req/s):", metrics.request_throughput
+        )
+    )
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Input token throughput (tok/s):", metrics.input_throughput
+        )
+    )
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Output token throughput (tok/s):", metrics.output_throughput
+        )
+    )
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Total token throughput (tok/s):", metrics.total_throughput
+        )
+    )
+    print("{:<40} {:<10.2f}".format("Concurrency:", metrics.concurrency))
+    print("{s:{c}^{n}}".format(s="End-to-End Latency", n=50, c="-"))
+    print(
+        "{:<40} {:<10.2f}".format("Mean E2E Latency (ms):", metrics.mean_e2e_latency_ms)
+    )
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Median E2E Latency (ms):", metrics.median_e2e_latency_ms
+        )
+    )
+    print("{s:{c}^{n}}".format(s="Time to First Token", n=50, c="-"))
+    print("{:<40} {:<10.2f}".format("Mean TTFT (ms):", metrics.mean_ttft_ms))
+    print("{:<40} {:<10.2f}".format("Median TTFT (ms):", metrics.median_ttft_ms))
+    print("{:<40} {:<10.2f}".format("P90 TTFT (ms):", metrics.p90_ttft_ms))
+    print("{:<40} {:<10.2f}".format("P99 TTFT (ms):", metrics.p99_ttft_ms))
+    print(
+        "{s:{c}^{n}}".format(s="Time per Output Token (excl. 1st token)", n=50, c="-")
+    )
+    print("{:<40} {:<10.2f}".format("Mean TPOT (ms):", metrics.mean_tpot_ms))
+    print("{:<40} {:<10.2f}".format("Median TPOT (ms):", metrics.median_tpot_ms))
+    print("{:<40} {:<10.2f}".format("P90 TPOT (ms):", metrics.p90_tpot_ms))
+    print("{:<40} {:<10.2f}".format("P99 TPOT (ms):", metrics.p99_tpot_ms))
+    print("{s:{c}^{n}}".format(s="Inter-token Latency", n=50, c="-"))
+    print("{:<40} {:<10.2f}".format("Mean ITL (ms):", metrics.mean_itl_ms))
+    print("{:<40} {:<10.2f}".format("Median ITL (ms):", metrics.median_itl_ms))
+    print("{:<40} {:<10.2f}".format("P90 ITL (ms):", metrics.p90_itl_ms))
+    print("{:<40} {:<10.2f}".format("P99 ITL (ms):", metrics.p99_itl_ms))
+    print("=" * 50)
+
+    if (
+        metrics.median_ttft_ms is not None
+        and metrics.mean_itl_ms is not None
+        and metrics.output_throughput is not None
+    ):
+        result = {
+            # Arguments
+            "backend": args.backend,
+            "dataset_name": args.dataset_name,
+            "request_rate": request_rate,
+            "max_concurrency": max_concurrency,
+            "fixed_output_len": args.fixed_output_len,
+            "random_input_len": args.random_input_len,
+            "random_output_len": args.random_output_len,
+            "random_range_ratio": args.random_range_ratio,
+            # Results
+            "duration": benchmark_duration,
+            "completed": metrics.completed,
+            "total_input_tokens": metrics.total_input,
+            "total_output_tokens": metrics.total_output,
+            "total_output_tokens_retokenized": metrics.total_output_retokenized,
+            "request_throughput": metrics.request_throughput,
+            "input_throughput": metrics.input_throughput,
+            "output_throughput": metrics.output_throughput,
+            "mean_e2e_latency_ms": metrics.mean_e2e_latency_ms,
+            "median_e2e_latency_ms": metrics.median_e2e_latency_ms,
+            "std_e2e_latency_ms": metrics.std_e2e_latency_ms,
+            "p99_e2e_latency_ms": metrics.p99_e2e_latency_ms,
+            "mean_ttft_ms": metrics.mean_ttft_ms,
+            "median_ttft_ms": metrics.median_ttft_ms,
+            "std_ttft_ms": metrics.std_ttft_ms,
+            "p99_ttft_ms": metrics.p99_ttft_ms,
+            "mean_tpot_ms": metrics.mean_tpot_ms,
+            "median_tpot_ms": metrics.median_tpot_ms,
+            "std_tpot_ms": metrics.std_tpot_ms,
+            "p99_tpot_ms": metrics.p99_tpot_ms,
+            "mean_itl_ms": metrics.mean_itl_ms,
+            "median_itl_ms": metrics.median_itl_ms,
+            "std_itl_ms": metrics.std_itl_ms,
+            "p99_itl_ms": metrics.p99_itl_ms,
+            "concurrency": metrics.concurrency,
+            "input_throughput": metrics.input_throughput,
+            "output_throughput": metrics.output_throughput,
+            "fixed_output_len": args.fixed_output_len,
+            "random_input_len": args.random_input_len,
+            "random_output_len": args.random_output_len,
+            "random_range_ratio": args.random_range_ratio,
+            "duration": benchmark_duration,
+            "completed": metrics.completed,
+        }
+    else:
+        print(f"Error running benchmark for request rate: {request_rate}")
+        print("-" * 30)
+
+    # Determine output file name
+    if args.output_file:
+        output_file_name = args.output_file
+    else:
+        now = datetime.now().strftime("%m%d")
+        if args.dataset_name == "random":
+            output_file_name = f"{args.backend}_{now}_{args.num_prompts}_{args.random_input_len}_{args.random_output_len}.jsonl"
+        else:
+            output_file_name = (
+                f"{args.backend}_{now}_{args.num_prompts}_{args.dataset_name}.jsonl"
+            )
+
+    # Append results to a JSONL file
+    with open(output_file_name, "a") as file:
+        file.write(json.dumps(result) + "\n")
+
+    result = {
+        "duration": benchmark_duration,
+        "completed": metrics.completed,
+        "total_input_tokens": metrics.total_input,
+        "total_output_tokens": metrics.total_output,
+        "total_output_tokens_retokenized": metrics.total_output_retokenized,
+        "request_throughput": metrics.request_throughput,
+        "input_throughput": metrics.input_throughput,
+        "output_throughput": metrics.output_throughput,
+        "mean_ttft_ms": metrics.mean_ttft_ms,
+        "median_ttft_ms": metrics.median_ttft_ms,
+        "std_ttft_ms": metrics.std_ttft_ms,
+        "p90_ttft_ms": metrics.p90_ttft_ms,
+        "p99_ttft_ms": metrics.p99_ttft_ms,
+        "mean_tpot_ms": metrics.mean_tpot_ms,
+        "median_tpot_ms": metrics.median_tpot_ms,
+        "std_tpot_ms": metrics.std_tpot_ms,
+        "p90_tpot_ms": metrics.p90_tpot_ms,
+        "p99_tpot_ms": metrics.p99_tpot_ms,
+        "mean_itl_ms": metrics.mean_itl_ms,
+        "median_itl_ms": metrics.median_itl_ms,
+        "std_itl_ms": metrics.std_itl_ms,
+        "p90_itl_ms": metrics.p90_itl_ms,
+        "p99_itl_ms": metrics.p99_itl_ms,
+        "input_lens": [output.prompt_len for output in outputs],
+        "output_lens": output_lens,
+        "ttfts": [output.ttft for output in outputs],
+        "itls": [output.itl for output in outputs],
+        "generated_texts": [output.generated_text for output in outputs],
+        "errors": [output.error for output in outputs],
+        "mean_e2e_latency_ms": metrics.mean_e2e_latency_ms,
+        "median_e2e_latency_ms": metrics.median_e2e_latency_ms,
+    }
+    return result
+
+
+def run_benchmark(args_: argparse.Namespace):
+    global args
+    args = args_
+
+    # Set default value for max_concurrency if not present
+    if not hasattr(args, "max_concurrency"):
+        args.max_concurrency = None
+
+    # Set global environments
+    set_ulimit()
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+
+    extra_request_body = {}
+    if args.extra_request_body:
+        extra_request_body = json.loads(args.extra_request_body)
+
+    # Set url
+    if args.port is None:
+        args.port = {
+            "sglang": 30000,
+            "lmdeploy": 23333,
+            "vllm": 8000,
+        }.get(args.backend, 30000)
+
+    model_url = (
+        f"{args.base_url}/v1/models"
+        if args.base_url
+        else f"http://{args.host}:{args.port}/v1/models"
+    )
+
+    if args.backend in ["sglang", "vllm", "lmdeploy"]:
+        api_url = (
+            f"{args.base_url}/v1/chat/completions"
+            if args.base_url
+            else f"http://{args.host}:{args.port}/v1/chat/completions"
+        )
+    base_url = (
+        f"http://{args.host}:{args.port}" if args.base_url is None else args.base_url
+    )
+
+    # Get model name
+    if args.model is None:
+        if args.backend == "truss":
+            print(
+                "Please provide a model with `--model` when using truss backend. e.g. --model meta-llama/Llama-3.1-8B-Instruct"
+            )
+            sys.exit(1)
+        try:
+            response = requests.get(model_url)
+            model_list = response.json().get("data", [])
+            args.model = model_list[0]["id"] if model_list else None
+        except Exception as e:
+            print(f"Failed to fetch model from {model_url}. Error: {e}")
+            print(
+                "Please specify the correct host and port using `--host` and `--port`."
+            )
+            sys.exit(1)
+
+    if args.model is None:
+        print("No model specified or found. Please provide a model using `--model`.")
+        sys.exit(1)
+
+    # Dataset compatibility check
+    if args.enable_multiturn:
+        # TODO: Support multiturn for random
+        if args.dataset_name not in ["sharegpt", "ultrachat", "loogle", "nextqa"]:
+            print(
+                "Multiturn conversation is only supported for sharegpt, ultrachat, loogle, and nextqa datasets."
+            )
+            sys.exit(1)
+
+    if args.enable_shared_prefix:
+        if args.dataset_name not in ["loogle", "nextqa"]:
+            print("Shared prefix is only supported for loogle and nextqa datasets.")
+            sys.exit(1)
+
+    print(f"{args}\n")
+
+    # Read dataset
+    backend = args.backend
+    model_id = args.model
+    tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
+
+    tokenizer = get_tokenizer(tokenizer_id)
+
+    input_requests = get_dataset(args, tokenizer)
+
+    return asyncio.run(
+        benchmark(
+            backend=backend,
+            api_url=api_url,
+            base_url=base_url,
+            model_id=model_id,
+            tokenizer=tokenizer,
+            input_requests=input_requests,
+            request_rate=args.request_rate,
+            max_concurrency=args.max_concurrency,
+            disable_tqdm=args.disable_tqdm,
+            lora_name=args.lora_name,
+            extra_request_body=extra_request_body,
+            profile=args.profile,
+            enable_shared_prefix=args.enable_shared_prefix,
+        )
+    )
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser(description="Benchmark the online serving throughput.")
+    parser.add_argument(
+        "--backend",
+        type=str,
+        choices=list(ASYNC_REQUEST_FUNCS.keys()),
+        default="sglang",
+        help="Must specify a backend, depending on the LLM Inference Engine.",
+    )
+    parser.add_argument(
+        "--base-url",
+        type=str,
+        default=None,
+        help="Server or API base url if not using http host and port.",
+    )
+    parser.add_argument(
+        "--host", type=str, default="0.0.0.0", help="Default host is 0.0.0.0."
+    )
+    parser.add_argument(
+        "--port",
+        type=int,
+        help="If not set, the default port is configured according to its default value for different LLM Inference Engines.",
+    )
+    parser.add_argument(
+        "--dataset-name",
+        type=str,
+        default="sharegpt",
+        choices=[
+            "sharegpt",
+            "random",
+            "generated-shared-prefix",
+            "ultrachat",
+            "loogle",
+            "nextqa",
+        ],
+        help="Name of the dataset to benchmark on.",
+    )
+    parser.add_argument(
+        "--dataset-path", type=str, default="", help="Path to the dataset."
+    )
+    parser.add_argument(
+        "--model",
+        type=str,
+        help="Name or path of the model. If not set, the default model will request /v1/models for conf.",
+    )
+    parser.add_argument(
+        "--tokenizer",
+        type=str,
+        help="Name or path of the tokenizer. If not set, using the model conf.",
+    )
+    parser.add_argument(
+        "--chat-template",
+        type=str,
+        help="The buliltin chat template name or the path of the chat template file. This is only used for OpenAI-compatible API server.",
+    )
+    parser.add_argument(
+        "--num-prompts",
+        type=int,
+        default=1000,
+        help="Number of prompts to process. Default is 1000.",
+    )
+    parser.add_argument(
+        "--fixed-output-len",
+        type=int,
+        default=None,
+        help="Output length for each request. Overrides the output length from the dataset.",
+    )
+    parser.add_argument(
+        "--sharegpt-context-len",
+        type=int,
+        default=None,
+        help="The context length of the model for the ShareGPT dataset. Requests longer than the context length will be dropped.",
+    )
+    parser.add_argument(
+        "--random-input-len",
+        type=int,
+        default=1024,
+        help="Number of input tokens per request, used only for random dataset.",
+    )
+    parser.add_argument(
+        "--random-output-len",
+        default=1024,
+        type=int,
+        help="Number of output tokens per request, used only for random dataset.",
+    )
+    parser.add_argument(
+        "--random-range-ratio",
+        type=float,
+        default=0.0,
+        help="Range of sampled ratio of input/output length, "
+        "used only for random dataset.",
+    )
+    parser.add_argument(
+        "--request-rate",
+        type=float,
+        default=float("inf"),
+        help="Number of requests per second. If this is inf, then all the requests are sent at time 0. "
+        "Otherwise, we use Poisson process to synthesize the request arrival times. Default is inf.",
+    )
+    parser.add_argument(
+        "--max-concurrency",
+        type=int,
+        default=None,
+        help="Maximum number of concurrent requests. This can be used "
+        "to help simulate an environment where a higher level component "
+        "is enforcing a maximum number of concurrent requests. While the "
+        "--request-rate argument controls the rate at which requests are "
+        "initiated, this argument will control how many are actually allowed "
+        "to execute at a time. This means that when used in combination, the "
+        "actual request rate may be lower than specified with --request-rate, "
+        "if the server is not processing requests fast enough to keep up.",
+    )
+    parser.add_argument(
+        "--multi",
+        action="store_true",
+        help="Use request rate range rather than single value.",
+    )
+    parser.add_argument(
+        "--request-rate-range",
+        type=str,
+        default="2,34,2",
+        help="Range of request rates in the format start,stop,step. Default is 2,34,2. It also supports a list of request rates, requiring the parameters to not equal three.",
+    )
+    parser.add_argument("--output-file", type=str, help="Output JSONL file name.")
+    parser.add_argument(
+        "--enable-multiturn",
+        action="store_true",
+        help="Enable multiturn chat for online serving benchmarking. "
+        "This option is effective on the following datasets: "
+        "sharegpt, ultrachat, loogle, nextqa",
+    )
+    parser.add_argument(
+        "--enable-shared-prefix",
+        action="store_true",
+        help="Enable shared prefix for online serving benchmarking. "
+        "This option is effective on the following datasets: "
+        "loogle, nextqa",
+    )
+
+    parser.add_argument(
+        "--disable-shuffle",
+        action="store_true",
+        help="Disable shuffling datasets. This is useful to generate stable output "
+        "in benchmarking",
+    )
+    parser.add_argument(
+        "--disable-tqdm",
+        action="store_true",
+        help="Specify to disable tqdm progress bar.",
+    )
+    parser.add_argument(
+        "--disable-stream",
+        action="store_true",
+        help="Disable streaming mode.",
+    )
+    parser.add_argument(
+        "--return-logprob",
+        action="store_true",
+        help="Return logprob.",
+    )
+    parser.add_argument("--seed", type=int, default=1, help="The random seed.")
+    parser.add_argument(
+        "--disable-ignore-eos",
+        action="store_true",
+        help="Disable ignoring EOS.",
+    )
+    parser.add_argument(
+        "--extra-request-body",
+        metavar='{"key1": "value1", "key2": "value2"}',
+        type=str,
+        help="Append given JSON object to the request payload. You can use this to specify"
+        "additional generate params like sampling params.",
+    )
+    parser.add_argument(
+        "--apply-chat-template",
+        action="store_true",
+        help="Apply chat template",
+    )
+    parser.add_argument(
+        "--profile",
+        action="store_true",
+        help="Use Torch Profiler. The endpoint must be launched with "
+        "SGLANG_TORCH_PROFILER_DIR to enable profiler.",
+    )
+    parser.add_argument(
+        "--lora-name",
+        type=str,
+        default=None,
+        help="The name of LoRA adapter",
+    )
+
+    group = parser.add_argument_group("generated-shared-prefix dataset arguments")
+    group.add_argument(
+        "--gsp-num-groups",
+        type=int,
+        default=64,
+        help="Number of system prompt groups for generated-shared-prefix dataset",
+    )
+    group.add_argument(
+        "--gsp-prompts-per-group",
+        type=int,
+        default=16,
+        help="Number of prompts per system prompt group for generated-shared-prefix dataset",
+    )
+    group.add_argument(
+        "--gsp-system-prompt-len",
+        type=int,
+        default=2048,
+        help="Target length in tokens for system prompts in generated-shared-prefix dataset",
+    )
+    group.add_argument(
+        "--gsp-question-len",
+        type=int,
+        default=128,
+        help="Target length in tokens for questions in generated-shared-prefix dataset",
+    )
+    group.add_argument(
+        "--gsp-output-len",
+        type=int,
+        default=256,
+        help="Target length in tokens for outputs in generated-shared-prefix dataset",
+    )
+    # videos specific
+    parser.add_argument(
+        "--max-frames",
+        type=int,
+        default=sys.maxsize,
+        help="The maximum number of frames to extract from each video. "
+        "This option is specific to the nextqa dataset (video benchmark). ",
+    )
+    args = parser.parse_args()
+
+    if args.enable_multiturn and args.enable_shared_prefix:
+        parser.error(
+            "--enable-multiturn and --enable-shared-prefix cannot be set at the same time."
+        )
+
+    run_benchmark(args)
--- a/benchmark/hicache/data_processing.py
+++ b/benchmark/hicache/data_processing.py
+import json
+import os
+import pickle
+import random
+from pathlib import Path
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+from nextqa import NExTQALoader
+
+# from nextqa.video import , VideoPrompt
+from tqdm.asyncio import tqdm
+from transformers import PreTrainedTokenizerBase
+
+SHAREGPT_URL = "https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json"
+
+from sglang.bench_serving import (
+    download_and_cache_file,
+    gen_prompt,
+    get_gen_prefix_cache_path,
+)
+from sglang.lang.chat_template import get_chat_template, get_chat_template_by_model_path
+from sglang.srt.entrypoints.openai.protocol import ChatCompletionMessageContentPart
+from sglang.utils import encode_video_base64
+
+# type of content fields, can be only prompts or with images/videos
+MsgContent = Union[str, List[ChatCompletionMessageContentPart]]
+
+# A list of all the conversations. Each conversation is a list of
+# tuples. If multiturn is not enabled, the length of list is 1,
+# containing only the first Q&A pair.
+# For the shared prefix workload (synthetic, loogle, nextqa), it
+# is a list of conversations sharing the same prefix (synthetic,
+# doc, video)
+SampleOutput = List[List[Tuple[MsgContent, int, int]]]
+
+
+def common_filter_chat(
+    num_requests: int,
+    new_dataset: List,
+    tokenizer: PreTrainedTokenizerBase,
+    min_prompt_len: Optional[int],
+    min_output_len: Optional[int],
+    max_prompt_len: Optional[int],
+    max_output_len: Optional[int],
+    fixed_output_len: Optional[int],
+) -> SampleOutput:
+    # Filter out sequences that are too long or too short
+    filtered_dataset: SampleOutput = []
+    l = 0
+    input_tokens = 0
+    output_tokens = 0
+    while l < num_requests:
+        for i in range(len(new_dataset)):
+            if l == num_requests:
+                break
+            processed = []
+            for j in new_dataset[i]:
+                # Tokenize the prompts and completions.
+                prompt = j[0]
+                prompt_token_ids = tokenizer.encode(prompt)
+                prompt_len = len(prompt_token_ids)
+
+                completion = j[1]
+                completion_token_ids = tokenizer.encode(completion)
+                output_len = (
+                    len(completion_token_ids)
+                    if fixed_output_len is None
+                    else fixed_output_len
+                )
+                if (
+                    min_prompt_len is not None
+                    and prompt_len < min_prompt_len
+                    or min_output_len is not None
+                    and output_len < min_output_len
+                    or max_prompt_len is not None
+                    and prompt_len > max_prompt_len
+                    or max_output_len is not None
+                    and output_len > max_output_len
+                ):
+                    # Prune too short sequences.
+                    continue
+                input_tokens += prompt_len
+                output_tokens += output_len
+                processed.append((prompt, prompt_len, output_len))
+            if len(processed) != 0:
+                filtered_dataset.append(processed)
+                l += 1
+
+    print(f"#Input tokens: {input_tokens}")
+    print(f"#Output tokens: {output_tokens}")
+    return filtered_dataset
+
+
+def sample_sharegpt_requests(
+    dataset_path: str,
+    num_requests: int,
+    tokenizer: PreTrainedTokenizerBase,
+    disable_shuffle: bool = False,
+    enable_multiturn: bool = True,
+    fixed_output_len: Optional[int] = None,
+) -> SampleOutput:
+    if fixed_output_len is not None and fixed_output_len < 4:
+        raise ValueError("output_len too small")
+
+    # Download sharegpt if necessary
+    if not os.path.isfile(dataset_path):
+        dataset_path = download_and_cache_file(SHAREGPT_URL)
+
+    # Load the dataset.
+    with open(dataset_path) as f:
+        dataset = json.load(f)
+    # Filter out the conversations with less than 2 turns.
+    dataset = [data for data in dataset if len(data["conversations"]) >= 2]
+
+    # Keep one conversation in one list
+    new_dataset = []
+    for data in dataset:
+        if len(data["conversations"]) % 2 != 0:
+            continue
+        if data["conversations"][0]["from"] != "human":
+            continue
+        chat = []
+        total_len = 2
+        if enable_multiturn:
+            total_len = len(data["conversations"])
+        for i in range(0, total_len, 2):
+            # One user One Assistant
+            chat.append(
+                (
+                    data["conversations"][i]["value"],
+                    data["conversations"][i + 1]["value"],
+                )
+            )
+        new_dataset.append(chat)
+
+    if not disable_shuffle:
+        # Shuffle the dataset.
+        random.shuffle(new_dataset)
+
+    # Filter out sequences that are too long or too short
+    filtered_dataset: SampleOutput = common_filter_chat(
+        num_requests, new_dataset, tokenizer, 4, 4, None, None, fixed_output_len
+    )
+    return filtered_dataset
+
+
+def sample_ultrachat_requests(
+    dataset_path: str,
+    num_requests: int,
+    tokenizer: PreTrainedTokenizerBase,
+    disable_shuffle: bool = False,
+    enable_multiturn: bool = True,
+    fixed_output_len: Optional[int] = None,
+) -> SampleOutput:
+    if fixed_output_len is not None and fixed_output_len < 4:
+        raise ValueError("output_len too small")
+
+    # Load the dataset
+    dataset = []
+    with open(dataset_path) as f:
+        while True:
+            line = f.readline()
+            if not line:
+                break
+            dataset.append(json.loads(line))
+
+    # Filter out the conversations with less than 2 turns.
+    dataset = [data for data in dataset if len(data["data"]) >= 2]
+
+    # Keep one conversation in one list
+    new_dataset = []
+    for data in dataset:
+        if len(data["data"]) % 2 != 0:
+            continue
+        chat = []
+        total_len = 2
+        if enable_multiturn:
+            total_len = len(data["data"])
+        for i in range(0, total_len, 2):
+            # One user One Assistant
+            chat.append((data["data"][i], data["data"][i + 1]))
+        new_dataset.append(chat)
+
+    # Shuffle the dataset.
+    if not disable_shuffle:
+        random.shuffle(new_dataset)
+
+    # Filter out sequences that are too long or too short
+    filtered_dataset: SampleOutput = common_filter_chat(
+        num_requests, new_dataset, tokenizer, 4, 4, None, None, fixed_output_len
+    )
+    return filtered_dataset
+
+
+def sample_loogle_requests(
+    dataset_path: str,
+    num_requests: int,
+    tokenizer: PreTrainedTokenizerBase,
+    disable_shuffle: bool = False,
+    enable_multiturn: bool = True,
+    enable_shared_prefix: bool = False,
+    fixed_output_len: Optional[int] = None,
+) -> SampleOutput:
+    if fixed_output_len is not None and fixed_output_len < 4:
+        raise ValueError("output_len too small")
+
+    # Load the dataset
+    dataset = []
+    with open(dataset_path) as f:
+        while True:
+            line = f.readline()
+            if not line:
+                break
+            dataset.append(json.loads(line))
+
+    # Keep one conversation in one list
+    new_dataset = []
+    # TODO: Add shared prefix support for loogle
+    # NOTE: Now we preprocess it only for chat
+    for data in dataset:
+        chat = []
+        if (
+            "qa_pairs" not in data
+            or data["qa_pairs"] == "none"
+            or len(data["qa_pairs"]) == 0
+        ):
+            # If Q is none (for summarization),
+            # We add a question for summarization
+            # And keep the summary up to 1024 words
+            chat.append(
+                (
+                    "Input: "
+                    + data["input"]
+                    + " Question: "
+                    + "Please summarize the input",
+                    data["input"][:1024],
+                )
+            )
+            new_dataset.append(chat)
+        else:
+            qa_pairs = eval(data["qa_pairs"])
+            for i, qa in enumerate(qa_pairs):
+                if i == 0 or enable_shared_prefix:
+                    # Combine input with the first Q
+                    chat.append(
+                        ("Input: " + data["input"] + " Question: " + qa["Q"], qa["A"])
+                    )
+                elif enable_multiturn:
+                    chat.append((qa["Q"], qa["A"]))
+
+            new_dataset.append(chat)
+
+    # Shuffle the dataset.
+    if not disable_shuffle:
+        random.shuffle(new_dataset)
+
+    # Filter out sequences that are too long or too short
+    filtered_dataset: SampleOutput = common_filter_chat(
+        num_requests, new_dataset, tokenizer, 4, None, None, None, fixed_output_len
+    )
+    return filtered_dataset
+
+
+def sample_nextqa_requests(
+    dataset_path: str,
+    num_requests: int,
+    tokenizer: PreTrainedTokenizerBase,
+    max_frames: int,  # Specific for video
+    model_path: str,
+    disable_shuffle: bool = False,
+    enable_multiturn: bool = True,  # No multiturn support for now
+    backend: str = "sglang-oai",
+    chat_template_name: Optional[str] = None,
+    fixed_output_len: Optional[int] = None,
+) -> SampleOutput:
+    """
+    Example of messages:
+    message = {
+        "role": "user",
+        "content": [
+            {"type": "image_url", "image_url": {"url": base64_data}},
+            {"type": "text", "text": video.prompt},
+        ],
+    }
+    """
+
+    if fixed_output_len is None:
+        fixed_output_len = 4096
+
+    # TODO: Check for multiturn
+    dataset = NExTQALoader(video_dir=dataset_path, max_frames=max_frames)
+    new_dataset = []
+    for v in dataset:
+        new_dataset.append(v)
+
+    if not disable_shuffle:
+        random.shuffle(new_dataset)
+
+    # TODO: prompt len can get from server side
+    filtered_dataset = []
+    l = 0
+    while l < num_requests:
+        for i in range(len(new_dataset)):
+            if l == num_requests:
+                break
+
+            video = new_dataset[i]
+
+            # text prompt
+            prompt = video.prompt
+
+            # NOTE: Chat Template is a must for video benchmark because we have to
+            # add special image token for later expansion
+            if backend == "sglang" or backend == "sglang-native":
+                if "chat_template" in tokenizer.init_kwargs:
+                    chat_template = get_chat_template(tokenizer.get_chat_template())
+                elif chat_template_name is not None:
+                    chat_template = get_chat_template(chat_template_name)
+                else:
+                    chat_template = get_chat_template_by_model_path(model_path)
+                prompt = chat_template.image_token + prompt
+
+            prompt_token_ids = tokenizer(prompt).input_ids
+            prompt_len = len(prompt_token_ids)
+            output_len = fixed_output_len  # max output len, not real output len
+
+            # video input
+            base64_data = encode_video_base64(video.path, video.num_frames)
+
+            # NOTE: This will be replaced by the expanded length from the server
+            prompt_len += video.num_frames
+
+            # add to content
+            content = [
+                {"type": "image_url", "image_url": {"url": base64_data}},
+                {"type": "text", "text": prompt},
+            ]
+
+            filtered_dataset.append([(content, prompt_len, output_len)])
+            l += 1
+    return filtered_dataset
+
+
+def sample_random_requests(
+    input_len: int,
+    output_len: int,
+    num_prompts: int,
+    range_ratio: float,
+    tokenizer: PreTrainedTokenizerBase,
+    dataset_path: str,
+    disable_shuffle: bool = False,
+) -> SampleOutput:
+
+    input_lens = np.random.randint(
+        max(int(input_len * range_ratio), 1),
+        input_len + 1,
+        size=num_prompts,
+    )
+    output_lens = np.random.randint(
+        int(output_len * range_ratio),
+        output_len + 1,
+        size=num_prompts,
+    )
+
+    if True:
+        # Sample token ids from ShareGPT and repeat/truncate them to satisfy the input_lens
+
+        # Download sharegpt if necessary
+        if not os.path.isfile(dataset_path):
+            dataset_path = download_and_cache_file(SHAREGPT_URL)
+
+        # Load the dataset.
+        with open(dataset_path) as f:
+            dataset = json.load(f)
+        # Filter out the conversations with less than 2 turns.
+        dataset = [data for data in dataset if len(data["conversations"]) >= 2]
+        # Only keep the first two turns of each conversation.
+        dataset = [
+            (data["conversations"][0]["value"], data["conversations"][1]["value"])
+            for data in dataset
+        ]
+
+        if not disable_shuffle:
+            # Shuffle the dataset.
+            random.shuffle(dataset)
+
+        # Filter out sequences that are too long or too short
+        input_requests: SampleOutput = []
+        for data in dataset:
+            i = len(input_requests)
+            if i == num_prompts:
+                break
+
+            # Tokenize the prompts and completions.
+            prompt = data[0]
+            prompt_token_ids = tokenizer.encode(prompt)
+            prompt_len = len(prompt_token_ids)
+
+            # Skip empty prompt
+            if prompt_len == 0:
+                continue
+
+            if prompt_len > input_lens[i]:
+                input_ids = prompt_token_ids[: input_lens[i]]
+            else:
+                ratio = (input_lens[i] + prompt_len - 1) // prompt_len
+                input_ids = (prompt_token_ids * ratio)[: input_lens[i]]
+            prompt = tokenizer.decode(input_ids)
+            input_requests.append([(prompt, int(input_lens[i]), int(output_lens[i]))])
+    else:
+        # Sample token ids from random integers. This can cause some NaN issues.
+        offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts)
+        input_requests = []
+        for i in range(num_prompts):
+            prompt = tokenizer.decode(
+                [
+                    (offsets[i] + i + j) % tokenizer.vocab_size
+                    for j in range(input_lens[i])
+                ]
+            )
+            input_requests.append([(prompt, int(input_lens[i]), int(output_lens[i]))])
+
+    print(f"#Input tokens: {np.sum(input_lens)}")
+    print(f"#Output tokens: {np.sum(output_lens)}")
+    return input_requests
+
+
+def gen_prompt(tokenizer, token_num):
+    """Generate a random prompt of specified token length using tokenizer vocabulary."""
+    all_available_tokens = list(tokenizer.get_vocab().values())
+    selected_tokens = random.choices(all_available_tokens, k=token_num)
+    return tokenizer.decode(selected_tokens)
+
+
+def get_gen_prefix_cache_path(args, tokenizer):
+    """Create cache directory under ~/.cache/sglang/benchmark"""
+    cache_dir = Path.home() / ".cache" / "sglang" / "benchmark"
+
+    # Create a unique cache filename based on the generation parameters
+    cache_key = (
+        f"gsp_prefix_{args.gsp_num_groups}_{args.gsp_prompts_per_group}_"
+        f"{args.gsp_system_prompt_len}_{args.gsp_question_len}_{args.gsp_output_len}_"
+        f"{tokenizer.__class__.__name__}.pkl"
+    )
+    return cache_dir / cache_key
+
+
+def sample_generated_shared_prefix_requests(
+    num_groups: int,
+    prompts_per_group: int,
+    system_prompt_len: int,
+    question_len: int,
+    output_len: int,
+    tokenizer: PreTrainedTokenizerBase,
+    args,
+    disable_shuffle: bool = False,
+) -> SampleOutput:
+    """Generate benchmark requests with shared system prompts using random tokens and caching."""
+    cache_path = get_gen_prefix_cache_path(args, tokenizer)
+
+    # Try to load from cache first
+    if cache_path.exists():
+        print(f"\nLoading cached generated input data from {cache_path}")
+        with open(cache_path, "rb") as f:
+            return pickle.load(f)
+
+    print("\nGenerating new input data...")
+
+    # Generate system prompts for each group
+    system_prompts = []
+    for _ in range(num_groups):
+        system_prompt = gen_prompt(tokenizer, system_prompt_len)
+        system_prompts.append(system_prompt)
+
+    # Generate questions
+    questions = []
+    for _ in range(num_groups * prompts_per_group):
+        question = gen_prompt(tokenizer, question_len)
+        questions.append(question)
+
+    # Combine system prompts with questions
+    input_requests = []
+    total_input_tokens = 0
+    total_output_tokens = 0
+
+    for group_idx in tqdm(range(num_groups), desc="Generating system prompt"):
+        system_prompt = system_prompts[group_idx]
+        input_requests.append([])
+        for prompt_idx in tqdm(
+            range(prompts_per_group), desc="Generating questions", leave=False
+        ):
+            question = questions[group_idx * prompts_per_group + prompt_idx]
+            full_prompt = f"{system_prompt}\n\n{question}"
+            prompt_len = len(tokenizer.encode(full_prompt))
+            input_requests[-1].append((full_prompt, prompt_len, output_len))
+            total_input_tokens += prompt_len
+            total_output_tokens += output_len
+
+    if not disable_shuffle:
+        # Shuffle questions
+        random.shuffle(input_requests)
+
+    # Print statistics
+    print(f"\nGenerated shared prefix dataset statistics:")
+    print(f"Number of groups: {num_groups}")
+    print(f"Prompts per group: {prompts_per_group}")
+    print(f"Total prompts: {len(input_requests) * prompts_per_group}")
+    print(f"Total input tokens: {total_input_tokens}")
+    print(f"Total output tokens: {total_output_tokens}")
+    print(
+        f"Average system prompt length: {sum(len(tokenizer.encode(sp)) for sp in system_prompts) / len(system_prompts):.1f} tokens"
+    )
+    print(
+        f"Average question length: {sum(len(tokenizer.encode(q)) for q in questions) / len(questions):.1f} tokens\n"
+    )
+
+    # Save to cache
+    cache_path.parent.mkdir(parents=True, exist_ok=True)
+    print(f"Caching generated input data to {cache_path}")
+    with open(cache_path, "wb") as f:
+        pickle.dump(input_requests, f)
+
+    return input_requests
+
+
+def get_dataset(args, tokenizer):
+    if args.dataset_name == "sharegpt":
+        input_requests = sample_sharegpt_requests(
+            dataset_path=args.dataset_path,
+            num_requests=args.num_prompts,
+            tokenizer=tokenizer,
+            disable_shuffle=args.disable_shuffle,
+            enable_multiturn=args.enable_multiturn,
+            fixed_output_len=args.fixed_output_len,
+        )
+    elif args.dataset_name == "ultrachat":
+        input_requests = sample_ultrachat_requests(
+            dataset_path=args.dataset_path,
+            num_requests=args.num_prompts,
+            tokenizer=tokenizer,
+            disable_shuffle=args.disable_shuffle,
+            enable_multiturn=args.enable_multiturn,
+            fixed_output_len=args.fixed_output_len,
+        )
+    elif args.dataset_name == "loogle":
+        input_requests = sample_loogle_requests(
+            dataset_path=args.dataset_path,
+            num_requests=args.num_prompts,
+            tokenizer=tokenizer,
+            disable_shuffle=args.disable_shuffle,
+            enable_multiturn=args.enable_multiturn,
+            enable_shared_prefix=args.enable_shared_prefix,
+            fixed_output_len=args.fixed_output_len,
+        )
+    elif args.dataset_name == "nextqa":
+        input_requests = sample_nextqa_requests(
+            dataset_path=args.dataset_path,
+            num_requests=args.num_prompts,
+            tokenizer=tokenizer,
+            max_frames=args.max_frames,
+            model_path=args.model,
+            disable_shuffle=args.disable_shuffle,
+            enable_multiturn=args.enable_multiturn,
+            backend=args.backend,
+            chat_template_name=args.chat_template,
+            fixed_output_len=args.fixed_output_len,
+        )
+    elif args.dataset_name == "random":
+        input_requests = sample_random_requests(
+            input_len=args.random_input_len,
+            output_len=args.random_output_len,
+            num_prompts=args.num_prompts,
+            range_ratio=args.random_range_ratio,
+            tokenizer=tokenizer,
+            dataset_path=args.dataset_path,
+        )
+    elif args.dataset_name == "generated-shared-prefix":
+        input_requests = sample_generated_shared_prefix_requests(
+            num_groups=args.gsp_num_groups,
+            prompts_per_group=args.gsp_prompts_per_group,
+            system_prompt_len=args.gsp_system_prompt_len,
+            question_len=args.gsp_question_len,
+            output_len=args.gsp_output_len,
+            args=args,
+            tokenizer=tokenizer,
+        )
+    else:
+        raise ValueError(f"Unknown dataset: {args.dataset_name}")
+    return input_requests
--- a/benchmark/hicache/download.sh
+++ b/benchmark/hicache/download.sh
+#!/usr/bin/bash
+
+# Print the accepted dataset names and terminate the script with status 1.
+usage() {
+    printf '%s\n' "Usage: $0 {sharegpt|ultragpt|loogle|nextqa|all}"
+    exit 1
+}
+
+# The download function
+download() {
+    case "$1" in
+        sharegpt)
+            echo $1
+            wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+            ;;
+        ultragpt)
+            echo $1
+            # Questions about the world
+            wget https://cloud.tsinghua.edu.cn/seafhttp/files/be1d7b87-22ca-449e-a6a7-c61d1ea7e010/ultrachat_release_230407.json
+            # Writing and Creation
+            wget https://cloud.tsinghua.edu.cn/seafhttp/files/61742d2a-25e2-4d08-b2b9-15f47ae50ace/ultrachat_material_release_230417.json
+            wget https://cloud.tsinghua.edu.cn/seafhttp/files/f71f6aa6-d346-4b16-85b7-8502efa3d608/ultrachat_material_release_230412.json
+            # External materials
+            wget https://cloud.tsinghua.edu.cn/seafhttp/files/42d22e28-e899-4975-a70f-5eda163e265d/ultrachat_existent_material_release_230420.json.gz
+            gunzip ultrachat_existent_material_release_230420.json.gz
+            ;;
+        loogle)
+            echo $1
+            git lfs install
+            git clone git@hf.co:datasets/bigainlco/LooGLE
+            unzip LooGLE/data.zip
+            ;;
+        nextqa)
+            echo $1
+            git lfs install
+            git clone https://huggingface.co/datasets/lmms-lab/NExTQA
+            unzip NExTQA/videos.zip
+            ;;
+        *)
+            usage
+            exit 1
+            ;;
+    esac
+}
+
+# Arg check
+if [ "$#" -ne 1 ]; then
+    usage
+fi
+
+# Invoke
+
+case "$1" in
+    sharegpt|ultragpt|loogle|nextqa)
+        download "$1"
+        ;;
+    all)
+        download sharegpt
+        download ultragpt
+        download loogle
+        download nextqa
+        ;;
+    *)
+        usage
+        ;;
+esac
--- a/benchmark/hicache/nextqa.py
+++ b/benchmark/hicache/nextqa.py
+import os
+import sys
+from typing import List
+
+import av
+from datasets import load_dataset
+
+
+def find_video_files(video_dir) -> List[str]:
+    if os.path.isfile(video_dir):
+        return [video_dir]
+
+    video_files = []
+    for root, dirs, files in os.walk(video_dir):
+        for file in files:
+            if file.endswith((".mp4", ".avi", ".mov")):
+                video_files.append(os.path.join(root, file))
+            # if file is dir
+            elif os.path.isdir(file):
+                video_files.extend(find_video_files(file))
+    return video_files
+
+
+def video_frames(video_path, max_frames) -> int:
+    container = av.open(video_path)
+    total_frames = container.streams.video[0].frames
+    return min(total_frames, max_frames)
+
+
+class Video:
+    def __init__(self, video_path, num_frames):
+        self.path = video_path
+        self.num_frames = num_frames
+
+    def __str__(self):
+        return f"Video({self.path}, {self.num_frames})"
+
+    def __iter__(self):
+        return iter((self.path, self.num_frames))
+
+
+class VideoPrompt(Video):
+    def __init__(self, video_path, num_frames, prompt):
+        super().__init__(video_path, num_frames)
+        self.prompt = prompt
+
+    def __str__(self):
+        return f"VideoPrompt({self.path}, {self.num_frames}, {self.prompt})"
+
+    def __iter__(self):
+        return iter((self.path, self.num_frames, self.prompt))
+
+
+class VideoLoader:
+    pass
+
+
+class VideoFileLoader(VideoLoader):
+    """
+    Load all the videos in a directory
+    """
+
+    def __init__(self, video_dir, batch_size=1, max_frames=sys.maxsize):
+        super().__init__()
+        self.video_dir = video_dir
+        self.video_files = find_video_files(video_dir)
+        self.batch_size = batch_size
+        self.max_frames = max_frames
+        print(f"batch_size: {batch_size}, max_frames: {max_frames}")
+
+    def __iter__(self):  # (file, number of frames)
+        if self.batch_size == 1:
+            for video_file in self.video_files:
+                yield Video(video_file, video_frames(video_file, self.max_frames))
+        else:
+            batch = []
+            for video_file in self.video_files:
+                video = Video(video_file, video_frames(video_file, self.max_frames))
+                batch.append(video)
+                if len(batch) == self.batch_size:
+                    yield batch
+                    batch = []
+
+
+class NExTQALoader(VideoLoader):
+    """
+    Load vdideos and prompts from NExT dataset
+    set: train, test or validation
+    """
+
+    def __init__(
+        self, video_dir, batch_size=1, max_frames=sys.maxsize, dset="test", task="OE"
+    ):
+        """
+        task: 'MV' or 'OE'
+        """
+        super().__init__()
+        self.task = task
+        print(f"Loading the {dset} data of {task} from lmms-lab/NExTQA")
+        self.ds = load_dataset("lmms-lab/NExTQA", task)
+        self.ds = self.ds[dset]
+
+        # self.n = ds.num_rows
+        self.video_dir = video_dir
+        self.video_files = find_video_files(video_dir)
+        self.video_to_path = dict()
+        for video_file in self.video_files:
+            video_id = video_file.split("/")[-1].split(".")[0]
+            self.video_to_path[video_id] = video_file
+
+        self.batch_size = batch_size
+        self.max_frames = max_frames
+
+    def get_video_prompt(self, entry, max_frames) -> VideoPrompt:
+        # Get video
+        video_id = entry["video"]
+        video_path = self.video_to_path[video_id]
+        assert os.path.exists(video_path), f"Video not found: {video_path}"
+        num_frames = min(entry["frame_count"], max_frames)
+        video = Video(video_path, num_frames)
+        prompt = entry["question"] + "?"
+        if self.task == "MC":  # add choices
+            prompt += f' a0: {entry["a0"]}, a1: {entry["a1"]}, a2: {entry["a2"]}, a3: {entry["a3"]}'
+        return VideoPrompt(video_path, num_frames, prompt)
+
+    def __iter__(self):
+        if self.batch_size == 1:
+            for entry in self.ds:
+                yield self.get_video_prompt(entry, self.max_frames)
+        else:
+            batch = []
+            for entry in self.ds:
+                video = self.get_video_prompt(entry, self.max_frames)
+                batch.append(video)
+                if len(batch) == self.batch_size:
+                    yield batch
+                    batch = []
+
+
+# Manual smoke test: load the NExT-QA "test" split for the "OE" task in
+# batches of 16 and print every video path, frame count and prompt.
+if __name__ == "__main__":
+    video_dir = "./videos"
+    # video_loader = VideoFileLoader(video_dir, batch_size=16)
+    # for batch in video_loader:
+    #     print(f"Number of videos in batch: {len(batch)}")
+    #     for video_file, num_frames in batch:
+    #         print(f"Video: {video_file} number of frames: {num_frames}")
+
+    video_loader = NExTQALoader(video_dir, batch_size=16, dset="test", task="OE")
+    for batch in video_loader:
+        print(f"Number of videos in batch: {len(batch)}")
+        # Each VideoPrompt unpacks into (path, num_frames, prompt).
+        for video_file, num_frames, prompt in batch:
+            print(
+                f"Video: {video_file} number of frames: {num_frames}, prompt: {prompt}"
+            )
+        # break
+        # for video_file, prompt in batch:
+        #     print(f"Video: {video_file} prompt: {prompt}")
+        #     break
--- a/benchmark/json_decode_regex/README.md
+++ b/benchmark/json_decode_regex/README.md
+## Run benchmark
+
+### Build dataset
+```
+pip install wikipedia
+python3 build_dataset.py
+```
+
+### Dependencies
+
+```
+llama_cpp_python          0.2.19
+guidance                  0.1.10
+vllm                      0.2.5
+outlines                  0.0.22
+```
+
+### Benchmark sglang
+
+Run Llama-7B
+
+```
+python3 -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
+```
+
+Run Mixtral-8x7B
+
+```
+python3 -m sglang.launch_server --model-path mistralai/Mixtral-8x7B-Instruct-v0.1 --port 30000 --tp-size 8
+```
+
+Benchmark
+
+```
+python3 bench_sglang.py --num-questions 10
+```
+
+
+### Benchmark Outlines + vLLM
+
+Run Llama-7B
+
+```
+python3 -m outlines.serve.serve --tokenizer-mode auto --model meta-llama/Llama-2-7b-chat-hf  --disable-log-requests --port 21000
+```
+
+Benchmark
+
+```
+python3 bench_other.py --backend outlines --num-questions 10
+```
+
+
+### Benchmark guidance
+
+Run Llama-7B and benchmark
+
+```
+python3 bench_other.py --backend guidance --num-questions 10 --parallel 1 --n-ctx 4096 --model-path path/to/gguf
+```
--- a/benchmark/json_decode_regex/bench_other.py
+++ b/benchmark/json_decode_regex/bench_other.py
+import argparse
+import json
+import time
+from concurrent.futures import ThreadPoolExecutor
+from functools import partial
+
+from tqdm import tqdm
+
+from sglang.lang.ir import REGEX_FLOAT, REGEX_INT, REGEX_STR
+from sglang.test.test_utils import add_common_other_args_and_parse, get_call_generate
+from sglang.utils import dump_state_text, read_jsonl
+
+REGEX_LIST = r"\[(" + REGEX_STR + ", )*" + REGEX_STR + r"\]"
+
+
+# fmt: off
+def json_decode(document, generate):
+    """Fill a JSON object about a city field by field with constrained generation.
+
+    ``generate`` is called once per field with the prompt built so far, a
+    token limit and a regex for that field; its return value is appended to
+    the prompt. The complete text (prompt plus all generated fields) is
+    returned.
+    """
+    s = "Please extract the information of a city from the following wikipedia page.\n"
+    s += "Page begin.\n" + document + "Page end.\n"
+    s += "Here is the name, country, and symbol of the city in JSON format.\n"
+    s += "{\n"
+    # Every field regex except the last ends with "," so the generated value
+    # carries its own trailing JSON comma.
+    s += '  "name": '
+    s += generate(s, max_tokens=8, regex=REGEX_STR + ",") + "\n"
+    s += '  "country": '
+    s += generate(s, max_tokens=8, regex=REGEX_STR + ",") + "\n"
+    s += '  "latitude": '
+    s += generate(s, max_tokens=8, regex=REGEX_FLOAT + ",") + "\n"
+    s += '  "population": '
+    s += generate(s, max_tokens=8, regex=REGEX_INT + ",") + "\n"
+    s += '  "top 3 landmarks": '
+    s += generate(s, max_tokens=24, regex=REGEX_LIST) + "\n"
+    s += "}\n"
+
+    return s
+# fmt: on
+
+
+def main(args):
+    lines = read_jsonl(args.data_path)
+    arguments = []
+    for i in range(len(lines[: args.num_questions])):
+        arguments.append(
+            {
+                "document": lines[i]["document"],
+            }
+        )
+    states = [None] * len(arguments)
+
+    # Select backend
+    call_generate = partial(get_call_generate(args), temperature=0)
+
+    # Run requests
+    def get_one_answer(i):
+        states[i] = json_decode(generate=call_generate, **arguments[i])
+
+    tic = time.perf_counter()
+    if args.parallel == 1:
+        for i in tqdm(range(len(arguments))):
+            get_one_answer(i)
+    else:
+        with ThreadPoolExecutor(args.parallel) as executor:
+            rets = list(
+                tqdm(
+                    executor.map(get_one_answer, list(range(len(arguments)))),
+                    total=len(arguments),
+                )
+            )
+            for _ in rets:
+                pass
+
+    latency = time.perf_counter() - tic
+
+    # Compute accuracy
+    print(f"Latency: {latency:.3f}")
+
+    # Write results
+    dump_state_text(f"tmp_output_{args.backend}.txt", states)
+
+    with open(args.result_file, "a") as fout:
+        value = {
+            "task": "json_decode_regex",
+            "backend": args.backend,
+            "num_gpus": 1,
+            "latency": round(latency, 3),
+            "num_requests": args.num_questions,
+            "other": {
+                "parallel": args.parallel,
+            },
+        }
+        fout.write(json.dumps(value) + "\n")
+
+
+# Script entry point: declare the benchmark-specific flags, let the shared
+# helper add the common backend flags and parse, then run the benchmark.
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--data-path", type=str, default="questions.jsonl")
+    parser.add_argument("--num-questions", type=int, default=20)
+    args = add_common_other_args_and_parse(parser)
+    main(args)
--- a/benchmark/json_decode_regex/bench_sglang.py
+++ b/benchmark/json_decode_regex/bench_sglang.py
+import argparse
+import json
+import time
+
+import sglang as sgl
+from sglang.lang.ir import REGEX_FLOAT, REGEX_INT, REGEX_STR
+from sglang.test.test_utils import (
+    add_common_sglang_args_and_parse,
+    select_sglang_backend,
+)
+from sglang.utils import dump_state_text, read_jsonl
+
+REGEX_LIST = r"\[(" + REGEX_STR + ", )*" + REGEX_STR + r"\]"
+
+# fmt: off
+# Warm-up program: generates one JSON object with the same field regexes as
+# the benchmark, then prints the text captured in the "json_output" scope.
+@sgl.function
+def json_warm_up(s):
+    s += "The information about Hogwarts is in the following JSON format.\n"
+    # Everything appended inside this scope is captured as s["json_output"].
+    with s.var_scope("json_output"):
+        s += "{\n"
+        s += '  "name": ' + sgl.gen("name", max_tokens=8, regex=REGEX_STR + ",") + "\n"
+        s += '  "country": ' + sgl.gen("country", max_tokens=8, regex=REGEX_STR + ",") + "\n"
+        s += '  "latitude": ' + sgl.gen("latitude", max_tokens=8, regex=REGEX_FLOAT + ",") + "\n"
+        s += '  "population": ' + sgl.gen("population", max_tokens=8, regex=REGEX_INT + ",") + "\n"
+        s += '  "top 3 landmarks": ' + sgl.gen( "landmarks", max_tokens=24, regex=REGEX_LIST) + "\n"
+        s += "}\n"
+    print(f'The warmp up json result is:\n{s["json_output"]}')
+# fmt: on
+
+# fmt: off
+# Benchmark program: asks for a city's details from `document` and generates
+# a JSON object field by field, each field constrained by its own regex.
+@sgl.function
+def json_decode(s, document):
+    s += "Please extract the information of a city from the following wikipedia page.\n"
+    s += "Page begin.\n" + document + "Page end.\n"
+    s += "Here is the name, country, and symbol of the city in JSON format.\n"
+    # Everything appended inside this scope is captured as s["json_output"].
+    with s.var_scope("json_output"):
+        s += "{\n"
+        s += '  "name": ' + sgl.gen("name", max_tokens=8, regex=REGEX_STR + ",") + "\n"
+        s += '  "country": ' + sgl.gen("country", max_tokens=8, regex=REGEX_STR + ",") + "\n"
+        s += '  "latitude": ' + sgl.gen("latitude", max_tokens=8, regex=REGEX_FLOAT + ",") + "\n"
+        s += '  "population": ' + sgl.gen("population", max_tokens=8, regex=REGEX_INT + ",") + "\n"
+        s += '  "top 3 landmarks": ' + sgl.gen( "landmarks", max_tokens=24, regex=REGEX_LIST) + "\n"
+        s += "}\n"
+# fmt: on
+
+
+def main(args):
+    lines = read_jsonl(args.data_path)
+    lines = list(lines)
+    arguments = []
+    for i in range(len(lines[: args.num_questions])):
+        arguments.append(
+            {
+                "document": lines[i]["document"],
+            }
+        )
+
+    # Select backend
+    backend = select_sglang_backend(args)
+    sgl.set_default_backend(backend)
+
+    # Warm up
+    json_warm_up.run().sync()
+
+    # Run requests
+    tic = time.perf_counter()
+    states = json_decode.run_batch(
+        arguments, temperature=0, num_threads=args.parallel, progress_bar=True
+    )
+    latency = time.perf_counter() - tic
+
+    # Compute accuracy
+    print(f"Latency: {latency:.3f}")
+
+    # Write results
+    dump_state_text(f"tmp_output_{args.backend}.txt", states)
+
+    with open(f"tmp_{args.backend}_json_results.txt", "w") as fout:
+        for state in states:
+            fout.write(state["json_output"] + "\n")
+
+    with open(args.result_file, "a") as fout:
+        value = {
+            "task": "json_decode_regex",
+            "backend": args.backend,
+            "num_gpus": 1,
+            "latency": round(latency, 3),
+            "num_requests": args.num_questions,
+            "other": {
+                "parallel": args.parallel,
+            },
+        }
+        fout.write(json.dumps(value) + "\n")
+
+
+# Script entry point: declare the benchmark-specific flags, let the shared
+# helper add the common sglang flags and parse, then run the benchmark.
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--data-path", type=str, default="questions.jsonl")
+    parser.add_argument("--num-questions", type=int, default=20)
+    args = add_common_sglang_args_and_parse(parser)
+    main(args)
--- a/benchmark/json_decode_regex/build_dataset.py
+++ b/benchmark/json_decode_regex/build_dataset.py
+import json
+
+import transformers
+import wikipedia
+
+# Tokenizer used only to measure and report document lengths in tokens.
+model_path = "meta-llama/Llama-2-7b-chat-hf"
+t = transformers.AutoTokenizer.from_pretrained(model_path)
+# Page titles looked up on Wikipedia, one document per entry.
+# NOTE(review): "los angles" looks like a misspelling of "los angeles";
+# confirm which page the lookup actually resolves to before changing it.
+city_names = [
+    "los angles",
+    "london",
+    "tokyo",
+    "beijing",
+    "singapore",
+    "paris",
+    "dubai",
+    "sydney",
+    "moscow",
+    "rome",
+    "toronto",
+    "rio de janeiro",
+    "istanbul",
+    "berlin",
+    "auckland",
+    "buenos aires",
+    "mexico city",
+    "mumbai",
+    "seoul",
+    "bangkok",
+    "cairo",
+    "athens",
+    "jerusalem",
+]
+
+
+def get_content(city_name):
+    content = str(wikipedia.page(city_name).content)
+    content = content.replace("\n\n", "\n")
+
+    tokens = t.encode(content)
+
+    expected_tokens = 3000
+    truncate_len = int((expected_tokens / len(tokens)) * len(content))
+    truncate_content = content[:truncate_len]
+    truncate_tokens = t.encode(truncate_content)
+
+    # Count token
+    print(
+        f"city_name: {city_name}, #tokens: {len(tokens)}, #truncate tokens: {len(truncate_tokens)}"
+    )
+
+    return truncate_content
+
+
+# Script entry point: write one JSON line per city to questions.jsonl, each
+# holding the truncated page text under the "document" key.
+if __name__ == "__main__":
+    with open("questions.jsonl", "w") as fout:
+        for city_name in city_names:
+            truncate_content = get_content(city_name)
+            fout.write(json.dumps({"document": truncate_content}) + "\n")
--- a/benchmark/json_jump_forward/README.md
+++ b/benchmark/json_jump_forward/README.md
+## Run benchmark
+
+### Dependencies
+
+```
+llama_cpp_python          0.2.38
+guidance                  0.1.10
+vllm                      0.2.7
+outlines                  0.0.25
+```
+
+### Build dataset
+
+When benchmarking long document information retrieval, run the following command to build the dataset:
+
+```bash
+pip install wikipedia
+python3 build_dataset.py
+```
+
+### Benchmark sglang
+
+Run Llama-7B
+
+```bash
+python3 -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
+```
+
+Benchmark Character Generation
+
+```bash
+python3 bench_sglang.py --mode character
+```
+
+Benchmark City Information Retrieval
+
+```bash
+python3 bench_sglang.py --mode city
+```
+
+
+### Benchmark Outlines + vLLM
+
+Run Llama-7B
+
+```bash
+python3 -m outlines.serve.serve --tokenizer-mode auto --model meta-llama/Llama-2-7b-chat-hf  --disable-log-requests --port 21000
+```
+
+Benchmark Character Generation
+
+```bash
+python3 bench_other.py --mode character --backend outlines
+```
+
+Benchmark City Information Retrieval
+
+```bash
+python3 bench_other.py --mode city --backend outlines
+```
+
+### Benchmark guidance
+
+Run Llama-7B and benchmark character generation
+
+```bash
+python3 bench_other.py --mode character --backend guidance --parallel 1 --n-ctx 4096 --model-path path/to/gguf
+```
+
+Run Llama-7B and benchmark city information retrieval
+
+```bash
+python3 bench_other.py --mode city --backend guidance --parallel 1 --n-ctx 4096 --model-path path/to/gguf
+```
+
+### Benchmark lmql
+
+Run Llama-7B and benchmark character generation
+
+```bash
+python3 bench_other.py --mode character --backend lmql --parallel 1
+```
+
+Run Llama-7B and benchmark city information retrieval
+
+```bash
+python3 bench_other.py --mode city --backend lmql --parallel 1
+```
--- a/benchmark/json_jump_forward/bench_other.py
+++ b/benchmark/json_jump_forward/bench_other.py
+import argparse
+import json
+import time
+from concurrent.futures import ThreadPoolExecutor
+from functools import partial
+
+import guidance
+from tqdm import tqdm
+
+from sglang.test.test_utils import add_common_other_args_and_parse, get_call_generate
+from sglang.utils import dump_state_text, read_jsonl
+
+# There are some FSM bugs with the JSON regex converted from a pydantic model,
+# so a hand-written string regex is used instead.
+# regex_string = build_regex_from_object(HarryPoterRole)
+# Regex for the whole character JSON object, including its literal layout
+# (newlines and indentation) and the allowed values of each field.
+character_regex = (
+    r"""\{\n"""
+    + r"""    "name": "[\w\d\s]{1,16}",\n"""
+    + r"""    "house": "(Gryffindor|Slytherin|Ravenclaw|Hufflepuff)",\n"""
+    + r"""    "blood status": "(Pure-blood|Half-blood|Muggle-born)",\n"""
+    + r"""    "occupation": "(student|teacher|auror|ministry of magic|death eater|order of the phoenix)",\n"""
+    + r"""    "wand": \{\n"""
+    + r"""        "wood": "[\w\d\s]{1,16}",\n"""
+    + r"""        "core": "[\w\d\s]{1,16}",\n"""
+    + r"""        "length": [0-9]{1,2}\.[0-9]{0,2}\n"""
+    + r"""    \},\n"""
+    + r"""    "alive": "(Alive|Deceased)",\n"""
+    + r"""    "patronus": "[\w\d\s]{1,16}",\n"""
+    + r"""    "bogart": "[\w\d\s]{1,16}"\n"""
+    + r"""\}"""
+)
+
+# Regex for the whole city JSON object, again including its literal layout.
+city_regex = (
+    r"""\{\n"""
+    + r"""  "name": "[\w\d\s]{1,16}",\n"""
+    + r"""  "country": "[\w\d\s]{1,16}",\n"""
+    + r"""  "latitude": [-+]?[0-9]*\.?[0-9]{0,2},\n"""
+    + r"""  "population": [-+]?[0-9]{1,9},\n"""
+    + r"""  "top 3 landmarks": \["[\w\d\s]{1,16}", "[\w\d\s]{1,16}", "[\w\d\s]{1,16}"\]\n"""
+    + r"""\}"""
+)
+
+# fmt: off
+def character_gen(name, generate):
+    s = name + " is a character in Harry Potter. Please fill in the following information about this character.\n"
+    s += generate(s, max_tokens=256, regex=character_regex)
+    return s
+# fmt: on
+
+# fmt: off
+def city_gen(document, generate):
+    s = "Please extract the information of a city from the following wikipedia page.\n"
+    s += "Page begin.\n" + document + "Page end.\n"
+    s += "Here is the name, country, and symbol of the city in JSON format.\n"
+    s += generate(s, max_tokens=256, regex=city_regex)
+    return s
+# fmt: on
+
+
+# guidance program: appends the character prompt and a JSON template to `lm`,
+# filling free-text fields with gen() and closed sets of values with select().
+@guidance
+def character_maker(lm, name):
+    regex_str_no_quote = r"[\w\d\s]+"
+    regex_float = r"[0-9]+\.[0-9]+"
+    # Doubled braces ({{ and }}) are literal braces inside the f-string.
+    lm += f"""\
+    {name} is a character in Harry Potter. Please fill in the following information about this character.
+    {{
+        "name": "{guidance.gen("name", max_tokens=16, regex=regex_str_no_quote)}",
+        "house": "{guidance.select(options=['Gryffindor', 'Slytherin', 'Ravenclaw', 'Hufflepuff'], name='house')}",
+        "blood status": "{guidance.select(options=['Pure-blood', 'Half-blood', 'Muggle-born'], name='blood status')}",
+        "occupation": "{guidance.select(options=['student', 'teacher', 'auror', 'ministry of magic', 'death eater', 'order of the phoenix'], name='occupation')}",
+        "wand": {{
+            "wood": "{guidance.gen("wood", max_tokens=16, regex=regex_str_no_quote)}",
+            "core": "{guidance.gen('core', max_tokens=16, regex=regex_str_no_quote)}",
+            "length": {guidance.gen('length', max_tokens=10, regex=regex_float)}
+        }},
+        "alive": "{guidance.select(options=['Alive', 'Deceased'], name='alive')}",
+        "patronus": "{guidance.gen('patronus', max_tokens=16, regex=regex_str_no_quote)}",
+        "bogart": "{guidance.gen('bogart', max_tokens=16, regex=regex_str_no_quote)}"
+    }}
+    """
+
+    return lm
+
+
+async def call_generate_lmql(
+    prompt, temperature, max_tokens, regex, max_len=4096, model=None, **kwargs
+):
+    """Run one regex-constrained completion through lmql and return the answer.
+
+    ``model`` is required despite its ``None`` default; the assertion below
+    rejects a missing model. Extra keyword arguments are forwarded to the
+    lmql query call.
+    """
+    assert model is not None
+    # Imported lazily so lmql is only needed when this backend is used.
+    import lmql
+
+    # The docstring of `program` is the lmql query itself, not documentation.
+    @lmql.query(model=model)
+    async def program(question, max_tokens, regex):
+        '''lmql
+        """{question}[ANSWER]""" where len(TOKENS(ANSWER)) < max_tokens and REGEX(ANSWER, regex)
+        return ANSWER
+        '''
+
+    return await program(
+        question=prompt,
+        temperature=temperature,
+        max_tokens=max_tokens,
+        max_len=max_len,
+        regex=regex,
+        **kwargs,
+    )
+
+
+# guidance program: appends the city-extraction prompt for `document` and a
+# JSON template to `lm`, filling each field with a regex-constrained gen().
+@guidance
+def city_maker(lm, document):
+    regex_str_no_quote = r"[\w\d\s]+"
+    regex_float = r"[0-9]+\.[0-9]+"
+    # Doubled braces ({{ and }}) are literal braces inside the f-string.
+    lm += f"""\
+    Please extract the information of a city from the following wikipedia page.
+    Page begin.
+    {document}
+    Page end.
+    Here is the name, country, and symbol of the city in JSON format.
+    {{
+        "name": "{guidance.gen("name", max_tokens=16, regex=regex_str_no_quote)}",
+        "country": "{guidance.gen("country", max_tokens=16, regex=regex_str_no_quote)}",
+        "latitude": {guidance.gen("latitude", max_tokens=10, regex=regex_float)},
+        "population": {guidance.gen("population", max_tokens=10, regex=r"[0-9]+")},
+        "top 3 landmarks": [
+            "{guidance.gen("landmark1", max_tokens=16, regex=regex_str_no_quote)}", "{guidance.gen("landmark2", max_tokens=16, regex=regex_str_no_quote)}", "{guidance.gen("landmark3", max_tokens=16, regex=regex_str_no_quote)}"
+        ]
+    }}
+    """
+
+    return lm
+
+
+def bench_character(args):
+    """Benchmark character-JSON generation on the selected backend.
+
+    Reads one character name per line from ``args.data_path`` (at most
+    ``args.num_jsons`` of them), generates one answer per name and returns
+    ``(states, latency)``, where ``states`` holds the per-name results in
+    input order and ``latency`` is the wall-clock time of the generation loop.
+
+    Raises:
+        ValueError: If ``args.backend`` is not outlines, guidance or lmql.
+    """
+    arguments = []
+    with open(args.data_path, "r") as f:
+        for line in f:
+            arguments.append({"name": line.strip()})
+    arguments = arguments[: args.num_jsons]
+
+    states = [None] * len(arguments)
+
+    # Select backend
+    if args.backend == "outlines":
+        call_generate = partial(get_call_generate(args), temperature=0)
+
+        def get_one_answer(i):
+            states[i] = character_gen(**arguments[i], generate=call_generate)
+
+    elif args.backend == "guidance":
+        model = guidance.models.LlamaCpp(
+            args.model_path,
+            n_gpu_layers=-1,
+            n_ctx=args.n_ctx,
+        )
+
+        def get_one_answer(i):
+            lm = model + character_maker(**arguments[i])
+            states[i] = lm
+
+    elif args.backend == "lmql":
+        # Imported lazily so these are only needed for the lmql backend.
+        import asyncio
+
+        import lmql
+
+        model = lmql.model(args.model_path, endpoint=f"{args.host}:{args.port}")
+        call_generate = partial(
+            call_generate_lmql,
+            model=model,
+            max_tokens=256,
+            regex=character_regex,
+        )
+
+        # NOTE(review): only the bare name is sent as the prompt here, unlike
+        # the other backends, which send a full instruction sentence; confirm.
+        async def get_one_answer_async(i):
+            states[i] = await call_generate(prompt=arguments[i]["name"], temperature=0)
+
+    else:
+        raise ValueError(f"Invalid backend: {args.backend}")
+
+    tic = time.perf_counter()
+
+    if args.backend != "lmql":
+        if args.parallel == 1:
+            for i in tqdm(range(len(arguments))):
+                get_one_answer(i)
+        else:
+            with ThreadPoolExecutor(args.parallel) as executor:
+                # list() drains the lazy map so all tasks have finished.
+                rets = list(
+                    tqdm(
+                        executor.map(get_one_answer, list(range(len(arguments)))),
+                        total=len(arguments),
+                    )
+                )
+                for _ in rets:
+                    pass
+    else:
+        # lmql is async: run the requests in groups of `args.parallel`,
+        # awaiting each group before starting the next.
+        batches = []
+        for i in range(0, len(arguments), args.parallel):
+            batches.append(list(range(i, min(i + args.parallel, len(arguments)))))
+        loop = asyncio.get_event_loop()
+
+        for bt in tqdm(batches):
+            loop.run_until_complete(
+                asyncio.gather(*[get_one_answer_async(i) for i in bt])
+            )
+
+    latency = time.perf_counter() - tic
+
+    return states, latency
+
+
+def bench_city_doc(args):
+    arguments = []
+    for line in read_jsonl(args.data_path):
+        arguments.append({"document": line["document"]})
+    arguments = arguments[: args.num_jsons]
+
+    states = [None] * len(arguments)
+
+    # Select backend
+    if args.backend == "outlines":
+        call_generate = partial(get_call_generate(args), temperature=0)
+
+        def get_one_answer(i):
+            states[i] = city_gen(**arguments[i], generate=call_generate)
+
+    elif args.backend == "guidance":
+        model = guidance.models.LlamaCpp(
+            args.model_path,
+            n_gpu_layers=-1,
+            n_ctx=args.n_ctx,
+        )
+
+        def get_one_answer(i):
+            lm = model + city_maker(**arguments[i])
+            states[i] = lm
+
+    else:
+        raise ValueError(f"Invalid backend: {args.backend}")
+
+    tic = time.perf_counter()
+    if args.parallel == 1:
+        for i in tqdm(range(len(arguments))):
+            get_one_answer(i)
+    else:
+        with ThreadPoolExecutor(args.parallel) as executor:
+            rets = executor.map(get_one_answer, list(range(len(arguments))))
+            for _ in rets:
+                pass
+
+    latency = time.perf_counter() - tic
+
+    return states, latency
+
+
+def main(args):
+    if args.mode == "character":
+        args.data_path = "dataset.txt"
+        states, latency = bench_character(args)
+    elif args.mode == "city":
+        args.data_path = "questions.jsonl"
+        states, latency = bench_city_doc(args)
+
+    # Compute accuracy
+    print(f"Latency: {latency:.3f}")
+
+    # Write results
+    dump_state_text(f"tmp_output_{args.backend}_{args.mode}.txt", states)
+
+    with open(args.result_file, "a") as fout:
+        value = {
+            "task": "json_jump_forward",
+            "backend": args.backend,
+            "latency": round(latency, 3),
+            "num_jsons": args.num_jsons,
+            "mode": args.mode,
+            "parallel": args.parallel,
+        }
+        fout.write(json.dumps(value) + "\n")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--data-path", type=str)
+    parser.add_argument("--num-jsons", type=int, default=50)
+    parser.add_argument(
+        "--mode", type=str, default="character", choices=["character", "city"]
+    )
+    args = add_common_other_args_and_parse(parser)
+    main(args)
--- a/benchmark/json_jump_forward/bench_sglang.py
+++ b/benchmark/json_jump_forward/bench_sglang.py
+import argparse
+import json
+import time
+
+import sglang as sgl
+from sglang.test.test_utils import (
+    add_common_sglang_args_and_parse,
+    select_sglang_backend,
+)
+from sglang.utils import dump_state_text, read_jsonl
+
+# The JSON regex converted from the pydantic model triggers some FSM bugs,
+# so a hand-written regex string is used here instead of:
+# regex_string = build_regex_from_object(HarryPoterRole)
+#
+# Fixed-layout JSON object for one character; passed as `regex=` to sgl.gen()
+# in character_gen. Free-text fields are limited to 1-16 word/space characters.
+character_regex = (
+    r"""\{\n"""
+    + r"""    "name": "[\w\d\s]{1,16}",\n"""
+    + r"""    "house": "(Gryffindor|Slytherin|Ravenclaw|Hufflepuff)",\n"""
+    + r"""    "blood status": "(Pure-blood|Half-blood|Muggle-born)",\n"""
+    + r"""    "occupation": "(student|teacher|auror|ministry of magic|death eater|order of the phoenix)",\n"""
+    + r"""    "wand": \{\n"""
+    + r"""        "wood": "[\w\d\s]{1,16}",\n"""
+    + r"""        "core": "[\w\d\s]{1,16}",\n"""
+    + r"""        "length": [0-9]{1,2}\.[0-9]{0,2}\n"""
+    + r"""    \},\n"""
+    + r"""    "alive": "(Alive|Deceased)",\n"""
+    + r"""    "patronus": "[\w\d\s]{1,16}",\n"""
+    + r"""    "bogart": "[\w\d\s]{1,16}"\n"""
+    + r"""\}"""
+)
+
+# Fixed-layout JSON object for one city; passed as `regex=` to sgl.gen() in
+# city_gen.
+# NOTE(review): every part of the latitude pattern is optional, so it can also
+# match an empty value, which would not be valid JSON -- confirm this is intended.
+city_regex = (
+    r"""\{\n"""
+    + r"""  "name": "[\w\d\s]{1,16}",\n"""
+    + r"""  "country": "[\w\d\s]{1,16}",\n"""
+    + r"""  "latitude": [-+]?[0-9]*\.?[0-9]{0,2},\n"""
+    + r"""  "population": [-+]?[0-9]{1,9},\n"""
+    + r"""  "top 3 landmarks": \["[\w\d\s]{1,16}", "[\w\d\s]{1,16}", "[\w\d\s]{1,16}"\]\n"""
+    + r"""\}"""
+)
+
+# fmt: off
+@sgl.function
+def character_gen(s, name):
+    """Prompt for a Harry Potter character profile constrained by ``character_regex``.
+
+    Appends an instruction mentioning ``name`` to ``s`` and then a generation
+    step whose result is stored under ``"json_output"`` (at most 256 tokens).
+    """
+    s += name + " is a character in Harry Potter. Please fill in the following information about this character.\n"
+    s += sgl.gen("json_output", max_tokens=256, regex=character_regex)
+# fmt: on
+
+# fmt: off
+@sgl.function
+def city_gen(s, document):
+    """Prompt for city information extracted from ``document``, constrained by ``city_regex``.
+
+    The document is wrapped between "Page begin." / "Page end." markers and the
+    generated text is stored under ``"json_output"`` (at most 256 tokens).
+    """
+    s += "Please extract the information of a city from the following wikipedia page.\n"
+    s += "Page begin.\n" + document + "Page end.\n"
+    # NOTE(review): the prompt mentions "symbol", while city_regex asks for
+    # latitude, population and landmarks -- confirm the wording is intentional.
+    s += "Here is the name, country, and symbol of the city in JSON format.\n"
+    s += sgl.gen("json_output",max_tokens=256, regex=city_regex)
+# fmt: on
+
+
+def bench_city_doc(args):
+    arguments = []
+    for line in read_jsonl(args.data_path):
+        arguments.append({"document": line["document"]})
+    arguments = arguments[: args.num_jsons]
+
+    # Select backend
+    backend = select_sglang_backend(args)
+    sgl.set_default_backend(backend)
+
+    # Run requests
+    tic = time.perf_counter()
+    states = city_gen.run_batch(
+        arguments,
+        temperature=0,
+        num_threads=args.parallel,
+        progress_bar=True,
+    )
+    latency = time.perf_counter() - tic
+
+    return states, latency
+
+
+def bench_character(args):
+    arguments = []
+    with open(args.data_path, "r") as f:
+        for line in f:
+            arguments.append({"name": line.strip()})
+    arguments = arguments[: args.num_jsons]
+
+    # Select backend
+    backend = select_sglang_backend(args)
+    sgl.set_default_backend(backend)
+
+    # Run requests
+    tic = time.perf_counter()
+    states = character_gen.run_batch(
+        arguments,
+        temperature=0,
+        num_threads=args.parallel,
+        progress_bar=True,
+    )
+    latency = time.perf_counter() - tic
+
+    return states, latency
+
+
+def main(args):
+    if args.mode == "character":
+        args.data_path = "dataset.txt"
+        states, latency = bench_character(args)
+    elif args.mode == "city":
+        args.data_path = "questions.jsonl"
+        states, latency = bench_city_doc(args)
+
+    # Compute accuracy
+    print(f"Latency: {latency:.3f}")
+
+    # Write results
+    dump_state_text(f"tmp_output_{args.backend}_{args.mode}.txt", states)
+    with open(f"{args.backend}_{args.mode}.json", "w") as fout:
+        for state in states:
+            fout.write(state["json_output"] + "\n")
+
+    with open(args.result_file, "a") as fout:
+        value = {
+            "task": "json_jump_forward",
+            "backend": args.backend,
+            "latency": round(latency, 3),
+            "num_jsons": args.num_jsons,
+            "mode": args.mode,
+            "parallel": args.parallel,
+        }
+        fout.write(json.dumps(value) + "\n")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--data-path", type=str)
+    parser.add_argument("--num-jsons", type=int, default=50)
+    parser.add_argument(
+        "--mode", type=str, default="character", choices=["character", "city"]
+    )
+    args = add_common_sglang_args_and_parse(parser)
+    main(args)
--- a/benchmark/json_jump_forward/build_dataset.py
+++ b/benchmark/json_jump_forward/build_dataset.py
+import json
+
+import transformers
+import wikipedia
+
+# Tokenizer used by get_content() to count tokens; it is loaded when the module
+# is imported.
+model_path = "meta-llama/Llama-2-7b-chat-hf"
+t = transformers.AutoTokenizer.from_pretrained(model_path)
+city_names = [
+    "los angles",
+    "london",
+    "tokyo",
+    "beijing",
+    "singapore",
+    "paris",
+    "dubai",
+    "sydney",
+    "moscow",
+    "rome",
+    "toronto",
+    "rio de janeiro",
+    "istanbul",
+    "berlin",
+    "auckland",
+    "buenos aires",
+    "mexico city",
+    "mumbai",
+    "seoul",
+    "bangkok",
+    "cairo",
+    "athens",
+    "jerusalem",
+]
+
+
+def get_content(city_name):
+    content = str(wikipedia.page(city_name).content)
+    content = content.replace("\n\n", "\n")
+
+    tokens = t.encode(content)
+
+    expected_tokens = 3000
+    truncate_len = int((expected_tokens / len(tokens)) * len(content))
+    truncate_content = content[:truncate_len]
+    truncate_tokens = t.encode(truncate_content)
+
+    # Count token
+    print(
+        f"city_name: {city_name}, #tokens: {len(tokens)}, #truncate tokens: {len(truncate_tokens)}"
+    )
+
+    return truncate_content
+
+
+if __name__ == "__main__":
+    with open("questions.jsonl", "w") as fout:
+        for city_name in city_names:
+            truncate_content = get_content(city_name)
+            fout.write(json.dumps({"document": truncate_content}) + "\n")
--- a/benchmark/json_jump_forward/dataset.txt
+++ b/benchmark/json_jump_forward/dataset.txt
+Harry Potter
+Hermione Granger
+Ron Weasley
+Albus Dumbledore
+Severus Snape
+Rubeus Hagrid
+Draco Malfoy
+Ginny Weasley
+Fred Weasley
+George Weasley
+Percy Weasley
+Sirius Black
+Remus Lupin
+Neville Longbottom
+Luna Lovegood
+Cedric Diggory
+Cho Chang
+Lord Voldemort
+Minerva McGonagall
+Filius Flitwick
+Dolores Umbridge
+Bellatrix Lestrange
+Lucius Malfoy
+Molly Weasley
+Arthur Weasley
+Nymphadora Tonks
+Dobby
+Moaning Myrtle
+Peter Pettigrew
+Alastor 'Mad-Eye' Moody
+Horace Slughorn
+Vernon Dursley
+Petunia Dursley
+Dudley Dursley
+Argus Filch
+Sybill Trelawney
+Gilderoy Lockhart
+Fleur Delacour
+Viktor Krum
+Bill Weasley
+Oliver Wood
+Cornelius Fudge
+Barty Crouch Sr.
+Barty Crouch Jr.
+Kingsley Shacklebolt
+Quirinus Quirrell
+Nearly Headless Nick
+Aunt Marge
+Griphook
+Ludo Bagman
--- a/benchmark/json_schema/README.md
+++ b/benchmark/json_schema/README.md
+## Run benchmark
+
+### Benchmark sglang
+
+Launch the server with Llama-3.1-8B-Instruct:
+
+```bash
+python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --port 30000
+```
+
+Run the benchmark:
+
+```bash
+python3 bench_sglang.py
+```
--- a/benchmark/json_schema/bench_sglang.py
+++ b/benchmark/json_schema/bench_sglang.py
+import argparse
+import json
+import time
+from typing import List, Tuple
+
+import jsonschema
+from datasets import load_dataset
+
+import sglang as sgl
+from sglang.global_config import global_config
+from sglang.srt.hf_transformers_utils import get_tokenizer
+from sglang.test.test_utils import (
+    add_common_sglang_args_and_parse,
+    select_sglang_backend,
+)
+from sglang.utils import dump_state_text
+
+
+@sgl.function
+def schema_gen(s, message: Tuple[str, str], json_schema: str):
+    """Chat-style sgl program generating JSON constrained by ``json_schema``.
+
+    Args:
+        s: the object the prompt parts are appended to (presumably the sglang
+            program state -- confirm against ``sgl.function``).
+        message: ``(system, user)`` pair of prompt texts.
+        json_schema: schema string forwarded to ``sgl.gen(json_schema=...)``.
+
+    The assistant turn is stored under ``"json_output"`` (at most 256 tokens,
+    temperature 0).
+    """
+    system, user = message
+    s += sgl.system(system)
+    s += sgl.user(user)
+    s += sgl.assistant(
+        sgl.gen("json_output", temperature=0, max_tokens=256, json_schema=json_schema)
+    )
+
+
+def contains_formats(schema, formats: List[str]):
+    if isinstance(schema, dict):
+        if schema.get("format", None) in formats:
+            return True
+        for value in schema.values():
+            if contains_formats(value, formats):
+                return True
+    elif isinstance(schema, list):
+        for item in schema:
+            if contains_formats(item, formats):
+                return True
+    return False
+
+
+def convert_dataset(path: str):
+    raw_dataset = load_dataset(path)
+    dataset = []
+    for data in raw_dataset["train"]:
+        messages = data["prompt"]
+        schema = data["schema"]
+        obj = json.loads(schema)
+
+        # skip some corrupted examples
+        if obj.get("type", None) is None:
+            continue
+
+        # skip schema with format "email"
+        # which is not supported by outlines for now
+        if contains_formats(obj, ["email"]):
+            continue
+
+        system = messages[0]
+        user = messages[1]
+        assert system["role"] == "system", "invalid role"
+        assert user["role"] == "user", "invalid role"
+        assert len(messages) == 2, "invalid message length"
+        message = json.dumps(system["content"]), json.dumps(user["content"])
+        dataset.append(
+            {
+                "message": message,
+                "json_schema": schema,
+            }
+        )
+
+    return dataset
+
+
+def bench_schema(args):
+    arguments = convert_dataset(args.data_path)
+
+    if args.num_jsons < 0 or args.num_jsons > len(arguments):
+        args.num_jsons = len(arguments)
+    arguments = arguments[: args.num_jsons]
+
+    # Select backend
+    backend = select_sglang_backend(args)
+    sgl.set_default_backend(backend)
+
+    # Run requests
+    tic = time.perf_counter()
+    states = schema_gen.run_batch(
+        arguments,
+        temperature=0,
+        num_threads=args.parallel,
+        progress_bar=True,
+    )
+    latency = time.perf_counter() - tic
+
+    # Check if the outputs are valid
+    indexes = []
+    for i, state in enumerate(states):
+        try:
+            schema = json.loads(arguments[i]["json_schema"])
+            obj = json.loads(state["json_output"])
+            assert jsonschema.validate(obj, schema) is None
+        except Exception as e:
+            print(e)
+            indexes.append(i)
+
+    return states, latency
+
+
+def main(args):
+    states, latency = bench_schema(args)
+
+    # Compute accuracy
+    tokenizer = get_tokenizer(
+        global_config.default_backend.get_server_info()["tokenizer_path"]
+    )
+    output_jsons = [state["json_output"] for state in states]
+    num_output_tokens = sum(len(tokenizer.encode(x)) for x in output_jsons)
+    print(f"Latency: {latency:.3f}")
+    print(f"Output throughput: {num_output_tokens / latency:.3f} token/s")
+    print(f"#output tokens: {num_output_tokens}")
+
+    # Write results
+    dump_state_text(f"tmp_output_{args.backend}.txt", states)
+    with open(f"{args.backend}.jsonl", "w") as fout:
+        for state in states:
+            fout.write(state["json_output"] + "\n")
+
+    with open(args.result_file, "a") as fout:
+        value = {
+            "task": "json_schema",
+            "backend": args.backend,
+            "latency": round(latency, 3),
+            "num_jsons": args.num_jsons,
+            "parallel": args.parallel,
+        }
+        fout.write(json.dumps(value) + "\n")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--data-path", type=str, default="NousResearch/json-mode-eval")
+    parser.add_argument("--num-jsons", type=int, default=-1)
+    args = add_common_sglang_args_and_parse(parser)
+    main(args)
--- a/benchmark/kernels/all_reduce/benchmark_mscclpp.py
+++ b/benchmark/kernels/all_reduce/benchmark_mscclpp.py
+"""For now, MSCCL is only supported in the TP16 and TP8 cases.
+
+export WORLD_SIZE=1
+export RANK=0
+export MASTER_ADDR=127.0.0.1
+export MASTER_PORT=12345
+
+torchrun --nproc_per_node gpu \
+--nnodes $WORLD_SIZE \
+--node_rank $RANK \
+--master_addr $MASTER_ADDR \
+--master_port $MASTER_PORT benchmark/kernels/all_reduce/benchmark_mscclpp.py
+"""
+
+import os
+from contextlib import nullcontext
+from typing import List
+
+import torch
+import torch.distributed as dist
+from torch.distributed import ProcessGroup
+
+from sglang.srt.distributed import init_distributed_environment
+from sglang.srt.distributed.device_communicators.pymscclpp import PyMscclppCommunicator
+from sglang.srt.distributed.device_communicators.pynccl import PyNcclCommunicator
+from sglang.srt.distributed.parallel_state import (
+    get_tensor_model_parallel_group,
+    graph_capture,
+    initialize_model_parallel,
+    set_mscclpp_all_reduce,
+)
+
+
+def torch_allreduce(torch_input: torch.Tensor, group: ProcessGroup) -> torch.Tensor:
+    """All-reduce ``torch_input`` over ``group`` via ``torch.distributed`` and return that same tensor."""
+    dist.all_reduce(torch_input, group=group)
+    return torch_input
+
+
+def msccl_allreduce(
+    msccl_input: torch.Tensor, msccl_comm: PyMscclppCommunicator
+) -> torch.Tensor:
+    """Return whatever ``msccl_comm.all_reduce`` produces for ``msccl_input``."""
+    return msccl_comm.all_reduce(msccl_input)
+
+
+def pynccl_allreduce(
+    msccl_input: torch.Tensor, pynccl_comm: PyNcclCommunicator
+) -> torch.Tensor:
+    """Run ``pynccl_comm.all_reduce`` on ``msccl_input`` and return the input tensor.
+
+    The communicator's return value is ignored; the caller treats this as an
+    in-place operation.
+    """
+    pynccl_comm.all_reduce(msccl_input)
+    return msccl_input
+
+
+def _bench_graph_time(func, inp_randn, warmup_loop=2, graph_loop=10, test_loop=10):
+    """Time ``func`` when replayed from a captured CUDA graph.
+
+    Args:
+        func: callable applied to a clone of ``inp_randn``.
+        inp_randn: input tensor; it is cloned before being handed to ``func``.
+        warmup_loop: number of untimed graph replays before measuring.
+        graph_loop: number of ``func`` calls captured inside the single graph.
+        test_loop: number of timed graph replays.
+
+    Returns:
+        ``(func_output, func_cost_us)``: a clone of the last captured call's
+        output taken after the first replay, and the mean time per ``func``
+        call in microseconds.
+    """
+    graph_input = inp_randn.clone()
+    with graph_capture() as graph_capture_context:
+        graph = torch.cuda.CUDAGraph()
+        with torch.cuda.graph(graph, stream=graph_capture_context.stream):
+            # All graph_loop calls are recorded into one graph.
+            for _ in range(graph_loop):
+                graph_out = func(graph_input)
+
+    # Replay once and snapshot the output before further replays overwrite it.
+    graph.replay()
+    func_output = graph_out.clone()
+
+    for _ in range(warmup_loop):
+        graph.replay()
+    torch.cuda.synchronize()
+
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
+
+    latencies: List[float] = []
+    for _ in range(test_loop):
+        # Line all ranks up before each timed replay.
+        torch.cuda.synchronize()
+        dist.barrier()
+        start_event.record()
+        graph.replay()
+        end_event.record()
+        end_event.synchronize()
+        latencies.append(start_event.elapsed_time(end_event))
+    # elapsed_time() is in milliseconds: average per replay, divide by the
+    # calls per replay, then convert to microseconds.
+    func_cost_us = sum(latencies) / len(latencies) / graph_loop * 1000
+    graph.reset()
+    return func_output, func_cost_us
+
+
+def _bench_eager_time(func, inp_randn, warmup_loop=2, test_loop=10):
+    """Time ``func`` in eager mode with CUDA events.
+
+    Args:
+        func: callable applied to a clone of ``inp_randn``.
+        inp_randn: input tensor; it is cloned before being handed to ``func``.
+        warmup_loop: number of untimed calls before measuring.
+        test_loop: number of timed calls.
+
+    Returns:
+        ``(func_output, func_cost_us)``: a clone of the first call's output and
+        the mean time per call in microseconds.
+    """
+    eager_input = inp_randn.clone()
+    eager_output = func(eager_input)
+    # Snapshot the first result; later calls reuse the same input tensor.
+    func_output = eager_output.clone()
+
+    for _ in range(warmup_loop):
+        func(eager_input)
+    torch.cuda.synchronize()
+
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
+    torch.cuda.synchronize()
+    start_event.record()
+    for _ in range(test_loop):
+        func(eager_input)
+    end_event.record()
+    torch.cuda.synchronize()
+    # elapsed_time() is in milliseconds; convert the per-call mean to microseconds.
+    func_cost_us = start_event.elapsed_time(end_event) / test_loop * 1000
+
+    return func_output, func_cost_us
+
+
+def get_torch_prof_ctx(do_prof: bool):
+    ctx = (
+        torch.profiler.profile(
+            activities=[
+                torch.profiler.ProfilerActivity.CPU,
+                torch.profiler.ProfilerActivity.CUDA,
+            ],
+            record_shapes=True,
+            with_stack=True,
+        )
+        if do_prof
+        else nullcontext()
+    )
+    return ctx
+
+
+def human_readable_size(size, decimal_places=1):
+    for unit in ["B", "KiB", "MiB", "GiB", "TiB", "PiB"]:
+        if size < 1024.0 or unit == "PiB":
+            break
+        size /= 1024.0
+    return f"{size:.{decimal_places}f} {unit}"
+
+
+# tabulate is optional: when it is missing, the name is bound to None and
+# print_markdown_table() builds the table by hand instead.
+try:
+    from tabulate import tabulate
+except ImportError:
+    print("tabulate not installed, skipping table printing")
+    tabulate = None
+
+
+def print_markdown_table(data):
+    if tabulate is not None:
+        print(tabulate(data, headers="keys", tablefmt="github"))
+        return
+    headers = data[0].keys()
+    header_row = "| " + " | ".join(headers) + " |"
+    separator = "| " + " | ".join(["---"] * len(headers)) + " |"
+    rows = []
+    for item in data:
+        row = "| " + " | ".join(str(item[key]) for key in headers) + " |"
+        rows.append(row)
+    markdown_table = "\n".join([header_row, separator] + rows)
+    print(markdown_table)
+
+
+if __name__ == "__main__":
+    import logging
+
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s - %(levelname)s - %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S",
+        force=True,
+    )
+    if not dist.is_initialized():
+        dist.init_process_group(backend="nccl")
+    world, world_size = dist.group.WORLD, dist.get_world_size()
+    rank = dist.get_rank()
+    torch.cuda.set_device(rank % 8)
+    device = torch.cuda.current_device()
+    set_mscclpp_all_reduce(True)
+    init_distributed_environment(
+        world_size=world_size,
+        rank=rank,
+        local_rank=rank % 8,
+    )
+    initialize_model_parallel(tensor_model_parallel_size=world_size)
+    group = get_tensor_model_parallel_group().device_group
+    cpu_group = get_tensor_model_parallel_group().cpu_group
+    pynccl_comm = get_tensor_model_parallel_group().pynccl_comm
+    pymscclpp_comm = get_tensor_model_parallel_group().pymscclpp_comm
+    dist.barrier()
+    profile = False
+    dtype = torch.bfloat16
+    ctx = get_torch_prof_ctx(profile)
+    result = []
+
+    with ctx:
+        for i in range(10, 20):
+            sz = 2**i
+            if sz * dtype.itemsize > 2**20:
+                break
+            inp_randn = torch.randint(1, 16, (sz,), dtype=dtype, device=device)
+
+            memory = torch.empty_like(inp_randn)
+            memory_out = torch.empty_like(memory)
+            torch_eager_output, torch_eager_time = _bench_eager_time(
+                lambda inp: torch_allreduce(inp, group), inp_randn
+            )
+            msccl_eager_output, msccl_eager_time = _bench_eager_time(
+                lambda inp: msccl_allreduce(inp, pymscclpp_comm), inp_randn
+            )
+            msccl_graph_output, msccl_graph_time = _bench_graph_time(
+                lambda inp: msccl_allreduce(inp, pymscclpp_comm), inp_randn
+            )
+            # since pynccl is inplace op, this return result is not correct if graph loop > 1
+            _, pynccl_graph_time = _bench_graph_time(
+                lambda inp: pynccl_allreduce(inp, pynccl_comm), inp_randn
+            )
+            torch.testing.assert_close(torch_eager_output, msccl_graph_output)
+            torch.testing.assert_close(torch_eager_output, msccl_eager_output)
+            result.append(
+                {
+                    "msg_size": human_readable_size(inp_randn.nbytes),
+                    "torch eager time": torch_eager_time,
+                    "msccl eager time": msccl_eager_time,
+                    "msccl graph time": msccl_graph_time,
+                    "pynccl graph time": pynccl_graph_time,
+                }
+            )
+            if rank == 0:
+                print(f"sz={sz}, dtype={dtype}: correctness check PASS!")
+    if rank == 0:
+        print_markdown_table(result)
+    if profile:
+        prof_dir = f"prof/msccl"
+        os.makedirs(prof_dir, exist_ok=True)
+        ctx.export_chrome_trace(f"{prof_dir}/trace_rank{dist.get_rank()}.json.gz")
--- a/benchmark/kernels/decoding_attention_triton/triton_flashinfer_cudnn.py
+++ b/benchmark/kernels/decoding_attention_triton/triton_flashinfer_cudnn.py
+import itertools
+import math
+
+import cudnn
+import torch
+import torch.utils.benchmark as benchmark
+from flashinfer import BatchDecodeWithPagedKVCacheWrapper
+
+from sglang.srt.layers.attention.triton_ops.decode_attention import decode_attention_fwd
+from sglang.srt.utils import should_use_tensor_core
+
+
+def benchmark_forward(
+    fn,
+    *inputs,
+    repeats=10,
+    amp=False,
+    amp_dtype=torch.float16,
+    **kwinputs,
+):
+    def amp_wrapper(*inputs, **kwinputs):
+        with torch.autocast(device_type="cuda", dtype=amp_dtype, enabled=amp):
+            fn(*inputs, **kwinputs)
+
+    t = benchmark.Timer(
+        stmt="fn_amp(*inputs, **kwinputs)",
+        globals={"fn_amp": amp_wrapper, "inputs": inputs, "kwinputs": kwinputs},
+        num_threads=torch.get_num_threads(),
+    )
+    m = t.timeit(repeats)
+    return t, m
+
+
+def time_fwd(func, *args, **kwargs):
+    time_f = benchmark_forward(func, *args, **kwargs)
+    return time_f[1].mean * 1e6
+
+
+def decode_attention_sglang(
+    q,
+    kv_data,
+    batch_size,
+    kv_len,
+    head_num_q,
+    head_num_kv,
+    head_dim,
+    num_kv_splits,
+    warmup=10,
+):
+
+    k_buffer = kv_data[0].view(-1, head_num_kv, head_dim)
+    v_buffer = kv_data[1].view(-1, head_num_kv, head_dim)
+    o = torch.empty_like(q)
+    total_tokens = batch_size * kv_len
+    req_to_token = torch.arange(0, total_tokens).to(0).int().view(batch_size, kv_len)
+    b_req_idx = torch.arange(0, batch_size).to(0).int()
+    b_seq_len = torch.full((batch_size,), kv_len, dtype=torch.int32, device="cuda")
+    max_len_in_batch = kv_len
+    sm_scale = 1.0 / (head_dim**0.5)
+
+    attn_logits = torch.empty(
+        (batch_size, head_num_q, num_kv_splits, head_dim + 1),
+        dtype=torch.float32,
+        device="cuda",
+    )
+
+    for _ in range(warmup):
+        decode_attention_fwd(
+            q,
+            k_buffer,
+            v_buffer,
+            o,
+            req_to_token,
+            b_req_idx,
+            b_seq_len,
+            attn_logits,
+            num_kv_splits,
+            sm_scale,
+        )
+
+    f = time_fwd(
+        decode_attention_fwd,
+        q,
+        k_buffer,
+        v_buffer,
+        o,
+        req_to_token,
+        b_req_idx,
+        b_seq_len,
+        attn_logits,
+        num_kv_splits,
+        sm_scale,
+    )
+
+    return f, o
+
+
+def decode_attention_flashinfer(dtype, head_num_q, head_num_kv):
+    workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8, device="cuda")
+    use_tensor_cores = should_use_tensor_core(
+        kv_cache_dtype=dtype,
+        num_attention_heads=head_num_q,
+        num_kv_heads=head_num_kv,
+    )
+    flashinfer_decode_wrapper = BatchDecodeWithPagedKVCacheWrapper(
+        workspace_buffer, "NHD", use_tensor_cores=use_tensor_cores
+    )
+
+    class FlashinferAttention(torch.autograd.Function):
+        @staticmethod
+        def forward(
+            ctx,
+            q,
+            kv_data,
+            batch_size,
+            kv_len,
+            head_num_q,
+            head_num_kv,
+            head_dim,
+            dtype,
+            warmup=10,
+        ):
+            total_tokens = batch_size * kv_len
+            kv_indptr = torch.arange(0, batch_size + 1).to(0).int() * kv_len
+            kv_indices = torch.arange(0, total_tokens).to(0).int()
+            kv_last_page_len = torch.full(
+                (batch_size,), 1, dtype=torch.int32, device="cuda"
+            )
+
+            flashinfer_decode_wrapper.end_forward()
+            flashinfer_decode_wrapper.begin_forward(
+                kv_indptr,
+                kv_indices,
+                kv_last_page_len,
+                head_num_q,
+                head_num_kv,
+                head_dim,
+                1,
+                pos_encoding_mode="NONE",
+                data_type=dtype,
+            )
+
+            for _ in range(warmup):
+                o = flashinfer_decode_wrapper.forward(
+                    q.contiguous().view(-1, head_num_q, head_dim), kv_data
+                )
+
+            f = time_fwd(
+                flashinfer_decode_wrapper.forward,
+                q.contiguous().view(-1, head_num_q, head_dim),
+                kv_data,
+            )
+
+            return f, o
+
+    return FlashinferAttention
+
+
+def convert_to_cudnn_type(torch_type):
+    if torch_type == torch.float16:
+        return cudnn.data_type.HALF
+    elif torch_type == torch.bfloat16:
+        return cudnn.data_type.BFLOAT16
+    elif torch_type == torch.float32:
+        return cudnn.data_type.FLOAT
+    elif torch_type == torch.int32:
+        return cudnn.data_type.INT32
+    elif torch_type == torch.int64:
+        return cudnn.data_type.INT64
+    else:
+        raise ValueError("Unsupported tensor data type.")
+
+
+def decode_attention_cudnn(
+    q, kv_data, batch_size, kv_len, head_num_q, head_num_kv, head_dim, dtype, warmup=10
+):
+    # Prepare data: continuous q,k,v
+    dims_q = (batch_size, head_num_q, 1, head_dim)
+    strides_q = (head_num_q * head_dim, head_dim, head_num_q * head_dim, 1)
+    q_gpu = q.as_strided(dims_q, strides_q)
+    o_gpu = (
+        torch.empty(batch_size * head_num_q * head_dim)
+        .half()
+        .cuda()
+        .as_strided(dims_q, strides_q)
+    )
+
+    dims_kv = (batch_size, head_num_kv, kv_len, head_dim)
+    strides_kv = (
+        kv_len * head_num_kv * head_dim,
+        head_dim,
+        head_num_kv * head_dim,
+        1,
+    )
+    k_gpu = kv_data[0].as_strided(dims_kv, strides_kv)
+    v_gpu = kv_data[1].as_strided(dims_kv, strides_kv)
+
+    seq_len_q_gpu = torch.full((batch_size, 1, 1, 1), 1, device="cuda")
+    seq_len_kv_gpu = torch.full((batch_size, 1, 1, 1), kv_len, device="cuda")
+    attn_scale = 1.0 / (head_dim**0.5)
+
+    # Prepare data: paged k,v
+    block_size = 1
+    blocks_per_batch = math.ceil(kv_len / block_size)
+    # [num_blocks, head_num_kv, block_size, head_dim], num_blocks = batch_size * blocks_per_batch
+    container_k_gpu = torch.cat(k_gpu.chunk(blocks_per_batch, dim=2), dim=0)
+    container_v_gpu = torch.cat(v_gpu.chunk(blocks_per_batch, dim=2), dim=0)
+    page_table_k_gpu = (
+        torch.linspace(
+            0,
+            batch_size * blocks_per_batch - 1,
+            batch_size * blocks_per_batch,
+            device="cuda",
+            dtype=torch.int32,
+        )
+        .reshape(blocks_per_batch, 1, batch_size, 1)
+        .transpose(0, 2)
+    )
+    page_table_v_gpu = page_table_k_gpu.clone()
+
+    graph = cudnn.pygraph(
+        io_data_type=convert_to_cudnn_type(dtype),
+        intermediate_data_type=cudnn.data_type.FLOAT,
+        compute_data_type=cudnn.data_type.FLOAT,
+    )
+
+    q = graph.tensor_like(q_gpu)
+    container_k = graph.tensor_like(container_k_gpu)
+    container_v = graph.tensor_like(container_v_gpu)
+    page_table_k = graph.tensor_like(page_table_k_gpu)
+    page_table_v = graph.tensor_like(page_table_v_gpu)
+
+    seq_len_q = graph.tensor_like(seq_len_q_gpu)
+    seq_len_kv = graph.tensor_like(seq_len_kv_gpu)
+
+    o, _ = graph.sdpa(
+        name="sdpa",
+        q=q,
+        k=container_k,  # Container K: non contiguous container with K blocks
+        v=container_v,  # Container V: non contiguous container with V blocks
+        is_inference=True,
+        attn_scale=attn_scale,
+        use_causal_mask=False,
+        use_padding_mask=True,
+        seq_len_q=seq_len_q,
+        seq_len_kv=seq_len_kv,
+        paged_attention_k_table=page_table_k,  # Page Table K: Tensor containing offsets to the container with K blocks
+        paged_attention_v_table=page_table_v,  # Page Table V: Tensor containing offsets to the container with V blocks
+        paged_attention_max_seq_len_kv=kv_len,  # The maximum sequence length for K caches (this is optional, but recommended)
+    )
+
+    o.set_output(True).set_dim(dims_q).set_stride(strides_q)
+
+    graph.validate()
+    graph.build_operation_graph()
+    graph.create_execution_plans([cudnn.heur_mode.A])
+    graph.check_support()
+    graph.build_plans()
+
+    workspace = torch.empty(
+        graph.get_workspace_size(), device="cuda", dtype=torch.uint8
+    )
+
+    variant_pack = {
+        q: q_gpu,
+        container_k: container_k_gpu,
+        container_v: container_v_gpu,
+        page_table_k: page_table_k_gpu,
+        page_table_v: page_table_v_gpu,
+        seq_len_q: seq_len_q_gpu,
+        seq_len_kv: seq_len_kv_gpu,
+        o: o_gpu,
+    }
+
+    for _ in range(warmup):
+        graph.execute(variant_pack, workspace)
+
+    f = time_fwd(
+        graph.execute,
+        variant_pack,
+        workspace,
+    )
+
+    return f, o_gpu.squeeze(dim=2)
+
+
+def calculate_diff():
+
+    dtype = torch.float16
+    batch_size = 64
+    kv_len = 4096
+    head_num_q = 64
+    head_num_kv = 8
+    head_dim = 128
+
+    q = torch.randn(batch_size, head_num_q, head_dim, dtype=dtype, device="cuda")
+    kv_data = (
+        torch.randn(
+            batch_size * kv_len, head_num_kv, head_dim, dtype=dtype, device="cuda"
+        ),
+        torch.randn(
+            batch_size * kv_len, head_num_kv, head_dim, dtype=dtype, device="cuda"
+        ),
+    )
+
+    _, output_sglang = decode_attention_sglang(
+        q,
+        kv_data,
+        batch_size,
+        kv_len,
+        head_num_q,
+        head_num_kv,
+        head_dim,
+        num_kv_splits=8,
+    )
+
+    attn_flashinfer = decode_attention_flashinfer(dtype, head_num_q, head_num_kv).apply
+    _, output_flashinfer = attn_flashinfer(
+        q, kv_data, batch_size, kv_len, head_num_q, head_num_kv, head_dim, dtype
+    )
+
+    _, output_cudnn = decode_attention_cudnn(
+        q, kv_data, batch_size, kv_len, head_num_q, head_num_kv, head_dim, dtype
+    )
+
+    print(f"SGLang output={output_sglang}")
+    print(f"FlashInfer output={output_flashinfer}")
+    print(f"cuDNN output={output_cudnn}")
+    if torch.allclose(output_sglang, output_flashinfer, atol=1e-2, rtol=1e-2):
+        print("✅ SGLang[Triton] and FlashInfer match")
+    else:
+        print("❌ SGLang[Triton] and FlashInfer differ")
+
+    if torch.allclose(output_sglang, output_cudnn, atol=1e-2, rtol=1e-2):
+        print("✅ SGLang[Triton] and cuDNN match")
+    else:
+        print("❌ SGLang[Triton] and cuDNN differ")
+
+
+if __name__ == "__main__":
+    calculate_diff()
+
+    head_dim = 128
+    dtype = torch.float16
+    batch_size_range = [2**i for i in range(0, 8, 2)]
+    kv_len_range = [2**i for i in range(6, 13, 1)]
+    configs = list(itertools.product(batch_size_range, kv_len_range))
+
+    for head_num_q, head_num_kv in [[32, 32], [64, 8], [40, 8]]:
+        attn_flashinfer = decode_attention_flashinfer(
+            dtype, head_num_q, head_num_kv
+        ).apply
+        for batch_size, kv_len in configs:
+            q = torch.randn(
+                batch_size, head_num_q, head_dim, dtype=dtype, device="cuda"
+            )
+            kv_data = (
+                torch.randn(
+                    batch_size * kv_len,
+                    head_num_kv,
+                    head_dim,
+                    dtype=dtype,
+                    device="cuda",
+                ),
+                torch.randn(
+                    batch_size * kv_len,
+                    head_num_kv,
+                    head_dim,
+                    dtype=dtype,
+                    device="cuda",
+                ),
+            )
+            us_cudnn, output_cudnn = decode_attention_cudnn(
+                q, kv_data, batch_size, kv_len, head_num_q, head_num_kv, head_dim, dtype
+            )
+            us_sglang, output_sglang = decode_attention_sglang(
+                q,
+                kv_data,
+                batch_size,
+                kv_len,
+                head_num_q,
+                head_num_kv,
+                head_dim,
+                num_kv_splits=8,
+            )
+            us_flashinfer, _ = attn_flashinfer(
+                q, kv_data, batch_size, kv_len, head_num_q, head_num_kv, head_dim, dtype
+            )
+            print(
+                head_num_q,
+                "  ",
+                head_num_kv,
+                "  ",
+                batch_size,
+                "  ",
+                kv_len,
+                "  ",
+                us_cudnn,
+                "  ",
+                us_sglang,
+                "  ",
+                us_flashinfer,
+            )