Commit 22085081 authored by Lianmin Zheng
parent f6d40df0
"""Sampling parameters for text generation."""
from typing import List, Optional, Union
_SAMPLING_EPS = 1e-6
class SamplingParams:
def __init__(
self,
temperature: float = 1.0,
top_p: float = 1.0,
top_k: int = -1,
frequency_penalty: float = 0.0,
presence_penalty: float = 0.0,
stop: Optional[Union[str, List[str]]] = None,
max_new_tokens: int = 16,
ignore_eos: bool = False,
skip_special_tokens: bool = True,
dtype: Optional[str] = None,
regex: Optional[str] = None,
) -> None:
self.temperature = temperature
self.top_p = top_p
self.top_k = top_k
self.frequency_penalty = frequency_penalty
self.presence_penalty = presence_penalty
self.stop_strs = stop
self.max_new_tokens = max_new_tokens
self.ignore_eos = ignore_eos
self.skip_special_tokens = skip_special_tokens
self.dtype = dtype
self.regex = regex
        # Process some special cases
        if self.temperature < _SAMPLING_EPS:
            # A (near-)zero temperature means greedy sampling: keep only the
            # top-1 token and use a safe temperature to avoid division by zero.
            self.temperature = 1.0
            self.top_k = 1
        if self.top_k == -1:
            self.top_k = 1 << 30  # whole vocabulary
        if self.dtype == "int":
            # Integer decoding stops at the first whitespace or newline.
            self.stop_strs = [" ", "\n"]
def verify(self):
if self.temperature < 0.0:
raise ValueError(
f"temperature must be non-negative, got {self.temperature}."
)
if not 0.0 < self.top_p <= 1.0:
raise ValueError(f"top_p must be in (0, 1], got {self.top_p}.")
if self.top_k < -1 or self.top_k == 0:
raise ValueError(
f"top_k must be -1 (disable), or at least 1, " f"got {self.top_k}."
)
if not -2.0 <= self.frequency_penalty <= 2.0:
raise ValueError(
"frequency_penalty must be in [-2, 2], got "
f"{self.frequency_penalty}."
)
if not -2.0 <= self.presence_penalty <= 2.0:
raise ValueError(
"presence_penalty must be in [-2, 2], got " f"{self.presence_penalty}."
)
if self.max_new_tokens < 0:
raise ValueError(
f"max_new_tokens must be at least 0, got {self.max_new_tokens}."
)
def normalize(self, tokenizer):
# Process stop strings
if self.stop_strs is None:
self.stop_strs = []
self.stop_str_max_len = 0
else:
if isinstance(self.stop_strs, str):
self.stop_strs = [self.stop_strs]
stop_str_max_len = 0
for stop_str in self.stop_strs:
stop_str_ids = tokenizer.encode(stop_str, add_special_tokens=False)
stop_str_max_len = max(stop_str_max_len, len(stop_str_ids))
self.stop_str_max_len = stop_str_max_len
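# Illustrative usage sketch (not part of the original file). `tokenizer` is
# assumed to be any Hugging Face-style tokenizer exposing `encode`:
#
#   params = SamplingParams(temperature=0.7, top_p=0.9, stop=["###"], max_new_tokens=64)
#   params.verify()              # raises ValueError if any field is out of range
#   params.normalize(tokenizer)  # wraps `stop` into a list and records stop_str_max_len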
"""SRT: SGLang Runtime"""
import argparse
import asyncio
import dataclasses
import json
import multiprocessing as mp
import sys
import threading
import time
from typing import List, Optional
# Work around a Python shutdown issue: concurrent.futures registers an atexit
# hook through threading._register_atexit that can block process exit, so it
# is replaced with a no-op here.
setattr(threading, "_register_atexit", lambda *args, **kwargs: None)
import psutil
import requests
import uvicorn
import uvloop
from fastapi import FastAPI
from fastapi.responses import StreamingResponse
from sglang.backend.runtime_endpoint import RuntimeEndpoint
from sglang.srt.managers.detokenizer_manager import start_detokenizer_process
from sglang.srt.managers.io_struct import GenerateReqInput
from sglang.srt.managers.openai_protocol import CompletionRequest
from sglang.srt.managers.router.manager import start_router_process
from sglang.srt.managers.tokenizer_manager import TokenizerManager
from sglang.srt.server_args import PortArgs, ServerArgs
from sglang.srt.utils import alloc_usable_network_port
asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
app = FastAPI()
tokenizer_manager = None
@app.get("/get_model_info")
async def get_model_info():
result = {
"model_path": tokenizer_manager.model_path,
}
return result
@app.post("/generate")
async def generate_request(obj: GenerateReqInput):
obj.post_init()
result_generator = tokenizer_manager.generate_request(obj)
if obj.stream:
async def stream_results():
async for out in result_generator:
yield (json.dumps(out) + "\0").encode("utf-8")
return StreamingResponse(stream_results(), media_type="text/event-stream")
else:
ret = await result_generator.__anext__()
return ret
@app.post("/v1/completions")
async def v1_completions(obj: CompletionRequest):
assert obj.n == 1
obj = GenerateReqInput(
text=obj.prompt,
sampling_params={
"temperature": obj.temperature,
"max_new_tokens": obj.max_tokens,
"stop": obj.stop,
},
)
ret = await generate_request(obj)
return {
"choices": [{"text": ret["text"]}],
}
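# Illustrative request against the endpoints above (assumes a server is
# already running at http://127.0.0.1:30000):
#
#   import requests
#   res = requests.post(
#       "http://127.0.0.1:30000/generate",
#       json={
#           "text": "The capital of France is",
#           "sampling_params": {"temperature": 0, "max_new_tokens": 16},
#       },
#   )
#   print(res.json()["text"])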
def launch_server(server_args, pipe_finish_writer):
global tokenizer_manager
# Allocate ports
can_use_ports = alloc_usable_network_port(
num=4 + server_args.tp_size, used_list=(server_args.port,)
)
port_args = PortArgs(
tokenizer_port=can_use_ports[0],
router_port=can_use_ports[1],
detokenizer_port=can_use_ports[2],
nccl_port=can_use_ports[3],
model_rpc_ports=can_use_ports[4:],
)
# Launch processes
tokenizer_manager = TokenizerManager(server_args, port_args)
pipe_router_reader, pipe_router_writer = mp.Pipe(duplex=False)
pipe_detoken_reader, pipe_detoken_writer = mp.Pipe(duplex=False)
proc_router = mp.Process(
target=start_router_process,
args=(
server_args,
port_args,
pipe_router_writer,
),
)
proc_router.start()
proc_detoken = mp.Process(
target=start_detokenizer_process,
args=(
server_args,
port_args,
pipe_detoken_writer,
),
)
proc_detoken.start()
# Wait for the model to finish loading
router_init_state = pipe_router_reader.recv()
detoken_init_state = pipe_detoken_reader.recv()
if router_init_state != "init ok" or detoken_init_state != "init ok":
proc_router.kill()
proc_detoken.kill()
print("router init state:", router_init_state)
print("detoken init state:", detoken_init_state)
sys.exit(1)
assert proc_router.is_alive() and proc_detoken.is_alive()
    def launch_api_server():
# Launch api server
uvicorn.run(
app,
host=server_args.host,
port=server_args.port,
log_level=server_args.log_level,
timeout_keep_alive=5,
loop="uvloop",
)
    t = threading.Thread(target=launch_api_server)
t.start()
    if pipe_finish_writer:
        url = server_args.url()
        success = False
        last_error = None
        for _ in range(60):
            try:
                requests.get(url + "/get_model_info", timeout=5)
                success = True
                break
            except requests.exceptions.RequestException as e:
                # Keep a reference: `e` is cleared when the except block exits,
                # so it cannot be read after the loop.
                last_error = e
                time.sleep(1)
        if success:
            pipe_finish_writer.send("init ok")
        else:
            pipe_finish_writer.send(str(last_error))
class Runtime:
def __init__(
self,
model_path: str,
tokenizer_path: Optional[str] = None,
load_format: str = "auto",
tokenizer_mode: str = "auto",
trust_remote_code: bool = True,
mem_fraction_static: float = 0.9,
tp_size: int = 1,
model_mode: List[str] = (),
schedule_heuristic: str = "lpm",
random_seed: int = 42,
log_level: str = "warning",
):
host = "127.0.0.1"
port = alloc_usable_network_port(1)[0]
server_args = ServerArgs(
model_path=model_path,
tokenizer_path=tokenizer_path,
host=host,
port=port,
load_format=load_format,
tokenizer_mode=tokenizer_mode,
trust_remote_code=trust_remote_code,
mem_fraction_static=mem_fraction_static,
tp_size=tp_size,
model_mode=model_mode,
schedule_heuristic=schedule_heuristic,
random_seed=random_seed,
log_level=log_level,
)
self.url = server_args.url()
self.pid = None
pipe_reader, pipe_writer = mp.Pipe(duplex=False)
proc = mp.Process(target=launch_server, args=(server_args, pipe_writer))
proc.start()
self.pid = proc.pid
init_state = pipe_reader.recv()
if init_state != "init ok":
self.shutdown()
raise RuntimeError("Launch failed")
self.endpoint = RuntimeEndpoint(self.url)
def shutdown(self):
if self.pid is not None:
parent = psutil.Process(self.pid)
children = parent.children(recursive=True)
for child in children:
child.kill()
psutil.wait_procs(children, timeout=5)
parent.kill()
parent.wait(timeout=5)
self.pid = None
def __del__(self):
self.shutdown()
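# Illustrative sketch of the intended lifecycle (the model path is only an example):
#
#   runtime = Runtime(model_path="meta-llama/Llama-2-7b-chat-hf", tp_size=1)
#   print(runtime.url)      # HTTP endpoint of the launched server
#   # ... use runtime.endpoint as an sglang backend ...
#   runtime.shutdown()      # kill the server process tree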
import argparse
import dataclasses
from typing import List, Optional
@dataclasses.dataclass
class ServerArgs:
model_path: str
tokenizer_path: Optional[str] = None
host: str = "127.0.0.1"
port: int = 30000
load_format: str = "auto"
tokenizer_mode: str = "auto"
trust_remote_code: bool = True
mem_fraction_static: float = 0.91
tp_size: int = 1
model_mode: List[str] = ()
schedule_heuristic: str = "lpm"
random_seed: int = 42
disable_log_stats: bool = False
log_stats_interval: int = 10
log_level: str = "info"
def __post_init__(self):
if self.tokenizer_path is None:
self.tokenizer_path = self.model_path
@staticmethod
def add_cli_args(parser: argparse.ArgumentParser):
parser.add_argument(
"--model-path",
type=str,
help="The path of the model weights. This can be a local folder or a Hugging Face repo ID.",
required=True,
)
parser.add_argument(
"--tokenizer-path",
type=str,
default=ServerArgs.tokenizer_path,
help="The path of the tokenizer.",
)
parser.add_argument("--host", type=str, default=ServerArgs.host)
parser.add_argument("--port", type=int, default=ServerArgs.port)
parser.add_argument(
"--load-format",
type=str,
default=ServerArgs.load_format,
choices=["auto", "pt", "safetensors", "npcache", "dummy"],
help="The format of the model weights to load. "
'"auto" will try to load the weights in the safetensors format '
"and fall back to the pytorch bin format if safetensors format "
"is not available. "
'"pt" will load the weights in the pytorch bin format. '
'"safetensors" will load the weights in the safetensors format. '
'"npcache" will load the weights in pytorch format and store '
"a numpy cache to speed up the loading. "
'"dummy" will initialize the weights with random values, '
"which is mainly for profiling.",
)
parser.add_argument(
"--tokenizer-mode",
type=str,
default=ServerArgs.tokenizer_mode,
choices=["auto", "slow"],
help="Tokenizer mode. 'auto' will use the fast "
"tokenizer if available, and 'slow' will "
"always use the slow tokenizer.",
)
parser.add_argument(
"--trust-remote-code",
action="store_true",
help="Whether or not to allow for custom models defined on the Hub in their own modeling files.",
)
parser.add_argument(
"--mem-fraction-static",
type=float,
default=ServerArgs.mem_fraction_static,
help="The fraction of the memory used for static allocation (model weights and KV cache memory pool)",
)
parser.add_argument(
"--tp-size",
type=int,
default=ServerArgs.tp_size,
help="Tensor parallelism degree.",
)
parser.add_argument(
"--model-mode",
type=str,
default=[],
nargs="+",
help="Model mode: [flashinfer, no-cache, aggressive-new-fill]",
)
parser.add_argument(
"--schedule-heuristic",
type=str,
default=ServerArgs.schedule_heuristic,
help="Schudule mode: [lpm, weight, random, fcfs]",
)
parser.add_argument(
"--random-seed",
type=int,
default=ServerArgs.random_seed,
help="Random seed.",
)
parser.add_argument(
"--log-level",
type=str,
default=ServerArgs.log_level,
help="Log level",
)
parser.add_argument(
"--disable-log-stats",
action="store_true",
help="Disable logging throughput stats.",
)
parser.add_argument(
"--log-stats-interval",
type=int,
default=ServerArgs.log_stats_interval,
help="Log stats interval in second.",
)
@classmethod
def from_cli_args(cls, args: argparse.Namespace):
attrs = [attr.name for attr in dataclasses.fields(cls)]
return cls(**{attr: getattr(args, attr) for attr in attrs})
def url(self):
return f"http://{self.host}:{self.port}"
@dataclasses.dataclass
class PortArgs:
tokenizer_port: int
router_port: int
detokenizer_port: int
nccl_port: int
model_rpc_ports: List[int]
import base64
import os
import random
import socket
import sys
import time
import traceback
from io import BytesIO
import numpy as np
import requests
import torch
import torch.distributed as dist
is_show_cost_time = False
def mark_cost_time(func_name):
def inner_func(func):
def time_func(*args, **kwargs):
if dist.get_rank() in [0, 1] and is_show_cost_time:
torch.cuda.synchronize()
start_time = time.time()
ans = func(*args, **kwargs)
torch.cuda.synchronize()
print(func_name, "cost time:", (time.time() - start_time) * 1000)
return ans
else:
torch.cuda.synchronize()
ans = func(*args, **kwargs)
torch.cuda.synchronize()
return ans
return time_func
return inner_func
time_mark = {}
def mark_start(key):
torch.cuda.synchronize()
global time_mark
time_mark[key] = time.time()
return
def mark_end(key, print_min_cost=0.0):
torch.cuda.synchronize()
global time_mark
cost_time = (time.time() - time_mark[key]) * 1000
if cost_time > print_min_cost:
print(f"cost {key}:", cost_time)
def calculate_time(show=False, min_cost_ms=0.0):
def wrapper(func):
def inner_func(*args, **kwargs):
torch.cuda.synchronize()
if show:
start_time = time.time()
result = func(*args, **kwargs)
torch.cuda.synchronize()
if show:
cost_time = (time.time() - start_time) * 1000
if cost_time > min_cost_ms:
print(f"Function {func.__name__} took {cost_time} ms to run.")
return result
return inner_func
return wrapper
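# Illustrative sketch: timing a function with the decorator above on a
# CUDA-enabled setup (the decorated function is hypothetical).
#
#   @calculate_time(show=True, min_cost_ms=1.0)
#   def heavy_step(x):
#       return x * x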
def set_random_seed(seed: int) -> None:
random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed_all(seed)
def alloc_usable_network_port(num, used_list=()):
port_list = []
for port in range(10000, 65536):
if port in used_list:
continue
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
try:
s.bind(("", port))
port_list.append(port)
except socket.error:
pass
if len(port_list) == num:
return port_list
return None
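# Illustrative sketch: probe for two free ports, skipping one known to be
# taken. The ports are only test-bound and released, and None is returned if
# not enough ports can be found.
#
#   ports = alloc_usable_network_port(num=2, used_list=(30000,))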
def get_exception_traceback():
etype, value, tb = sys.exc_info()
err_str = "".join(traceback.format_exception(etype, value, tb))
return err_str
def get_int_token_logit_bias(tokenizer, vocab_size):
from transformers import LlamaTokenizer, LlamaTokenizerFast
logit_bias = np.zeros(vocab_size, dtype=np.float32)
for t_id in range(vocab_size):
ss = tokenizer.decode(t_id).strip()
if not (ss.isdigit() or len(ss) == 0 or t_id == tokenizer.eos_token_id):
logit_bias[t_id] = -1e5
# else:
# print(ss, t_id)
return logit_bias
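# Illustrative sketch of how such a bias is typically applied; adding it to
# the logits here is an assumption for illustration, not code from this file.
#
#   bias = get_int_token_logit_bias(tokenizer, vocab_size)
#   logits = logits + torch.from_numpy(bias).to(logits.device)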
def wrap_kernel_launcher(kernel):
"""A faster launcher for triton kernels."""
import torch.distributed as dist
if dist.is_initialized():
rank = dist.get_rank()
else:
rank = 0
kernels = kernel.cache[rank].values()
kernel = next(iter(kernels))
    # Different Triton versions use different low-level names
if hasattr(kernel, "cu_function"):
kfunction = kernel.cu_function
else:
kfunction = kernel.function
if hasattr(kernel, "c_wrapper"):
run = kernel.c_wrapper
else:
run = kernel.run
    # Newer Triton launch signatures take extra cluster dimensions; on a
    # TypeError the flag is flipped and the launch is retried with the other
    # signature.
    add_cluster_dim = True
def ret_func(grid, num_warps, *args):
nonlocal add_cluster_dim
try:
if add_cluster_dim:
run(
grid[0],
grid[1],
grid[2],
num_warps,
1,
1,
1,
1,
kernel.shared,
0,
kfunction,
None,
None,
kernel,
*args,
)
else:
run(
grid[0],
grid[1],
grid[2],
num_warps,
kernel.shared,
0,
kfunction,
None,
None,
kernel,
*args,
)
except TypeError:
add_cluster_dim = not add_cluster_dim
ret_func(grid, num_warps, *args)
return ret_func
def is_multimodal_model(model):
if isinstance(model, str):
return "llava" in model
from sglang.srt.model_config import ModelConfig
if isinstance(model, ModelConfig):
return "llava" in model.path.lower()
raise Exception("unrecognized type")
def load_image(image_file):
from PIL import Image
image = None
if image_file.startswith("http://") or image_file.startswith("https://"):
timeout = int(os.getenv("REQUEST_TIMEOUT", "3"))
response = requests.get(image_file, timeout=timeout)
image = Image.open(BytesIO(response.content))
elif image_file.lower().endswith(("png", "jpg", "jpeg", "webp", "gif")):
image = Image.open(image_file)
elif image_file.startswith("data:"):
        image_file = image_file.split(",")[1]
image = Image.open(BytesIO(base64.b64decode(image_file)))
else:
image = Image.open(BytesIO(base64.b64decode(image_file)))
return image
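# Illustrative sketch: the helper accepts an http(s) URL, a local image path,
# a data URI, or a raw base64-encoded string (the file names are examples only).
#
#   img = load_image("https://example.com/cat.png")
#   img = load_image("local_photo.jpg")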
"""
This file contains the SGL programs used for unit testing.
"""
import json
import re
import sglang as sgl
def test_few_shot_qa():
@sgl.function
def few_shot_qa(s, question):
s += "The following are questions with answers.\n\n"
s += "Q: What is the capital of France?\n"
s += "A: Paris\n"
s += "Q: What is the capital of Germany?\n"
s += "A: Berlin\n"
s += "Q: What is the capital of Italy?\n"
s += "A: Rome\n"
s += "Q: " + question + "\n"
s += "A:" + sgl.gen("answer", stop="\n", temperature=0)
ret = few_shot_qa.run(question="What is the capital of the United States?")
assert "washington" in ret["answer"].strip().lower(), f"answer: {ret['answer']}"
rets = few_shot_qa.run_batch(
[
{"question": "What is the capital of Japan?"},
{"question": "What is the capital of the United Kingdom?"},
{"question": "What is the capital city of China?"},
],
temperature=0.1,
)
answers = [x["answer"].strip().lower() for x in rets]
assert answers == ["tokyo", "london", "beijing"], f"answers: {answers}"
def test_mt_bench():
@sgl.function
def answer_mt_bench(s, question_1, question_2):
s += sgl.system("You are a helpful assistant.")
s += sgl.user(question_1)
s += sgl.assistant(sgl.gen("answer_1"))
with s.user():
s += question_2
with s.assistant():
s += sgl.gen("answer_2")
question_1 = "Compose an engaging travel blog post about a recent trip to Hawaii, highlighting cultural experiences and must-see attractions."
question_2 = (
"Rewrite your previous response. Start every sentence with the letter A."
)
ret = answer_mt_bench.run(
question_1=question_1, question_2=question_2, temperature=0.7, max_new_tokens=64
)
assert len(ret.messages()) in [4, 5]
def test_select(check_answer):
@sgl.function
def true_or_false(s, statement):
s += "Determine whether the statement below is True, False, or Unknown.\n"
s += "Statement: The capital of France is Pairs.\n"
s += "Answer: True\n"
s += "Statement: " + statement + "\n"
s += "Answer:" + sgl.select("answer", ["True", "False", "Unknown"])
ret = true_or_false.run(
statement="The capital of Germany is Berlin.",
)
if check_answer:
assert ret["answer"] == "True", ret.text
else:
assert ret["answer"] in ["True", "False", "Unknown"]
ret = true_or_false.run(
statement="The capital of Canada is Tokyo.",
)
if check_answer:
assert ret["answer"] == "False", ret.text
else:
assert ret["answer"] in ["True", "False", "Unknown"]
ret = true_or_false.run(
statement="Purple is a better color than green.",
)
if check_answer:
assert ret["answer"] == "Unknown", ret.text
else:
assert ret["answer"] in ["True", "False", "Unknown"]
def test_decode_int():
@sgl.function
def decode_int(s):
s += "The number of hours in a day is " + sgl.gen_int("hours") + "\n"
s += "The number of days in a year is " + sgl.gen_int("days") + "\n"
ret = decode_int.run(temperature=0.1)
assert int(ret["hours"]) == 24, ret.text
assert int(ret["days"]) == 365, ret.text
def test_decode_json():
@sgl.function
def decode_json(s):
s += "Generate a JSON object to describe the basic information of a city.\n"
with s.var_scope("json_output"):
s += "{\n"
s += ' "name": ' + sgl.gen_string() + ",\n"
s += ' "population": ' + sgl.gen_int() + ",\n"
s += ' "area": ' + sgl.gen(dtype=int) + ",\n"
s += ' "country": ' + sgl.gen_string() + ",\n"
s += ' "timezone": ' + sgl.gen(dtype=str) + "\n"
s += "}"
ret = decode_json.run()
js_obj = json.loads(ret["json_output"])
assert isinstance(js_obj["name"], str)
assert isinstance(js_obj["population"], int)
def test_expert_answer():
@sgl.function
def expert_answer(s, question):
s += "Question: " + question + "\n"
s += (
"A good person to answer this question is"
+ sgl.gen("expert", stop=[".", "\n"])
+ ".\n"
)
s += (
"For example,"
+ s["expert"]
+ " would answer that "
+ sgl.gen("answer", stop=".")
+ "."
)
ret = expert_answer.run(question="What is the capital of France?", temperature=0.1)
assert "paris" in ret.text().lower()
def test_tool_use():
def calculate(expression):
return f"{eval(expression)}"
@sgl.function
def tool_use(s, lhs, rhs):
s += "Please perform computations using a calculator. You can use calculate(expression) to get the results.\n"
s += "For example,\ncalculate(1+2)=3\ncalculate(3*4)=12\n"
s += "Question: What is the product of " + lhs + " and " + rhs + "?\n"
s += (
"Answer: The answer is calculate("
+ sgl.gen("expression", stop=")")
+ ") = "
)
with s.var_scope("answer"):
s += calculate(s["expression"])
lhs, rhs = 257, 983
ret = tool_use(lhs=lhs, rhs=rhs, temperature=0)
assert int(ret["answer"]) == lhs * rhs
def test_react():
@sgl.function
def react(s, question):
s += """
Question: Which country does the founder of Microsoft live in?
Thought 1: I need to search for the founder of Microsoft.
Action 1: Search [Founder of Microsoft].
Observation 1: The founder of Microsoft is Bill Gates.
Thought 2: I need to search for the country where Bill Gates lives in.
Action 2: Search [Where does Bill Gates live].
Observation 2: Bill Gates lives in the United States.
Thought 3: The answer is the United States.
Action 3: Finish [United States].\n
"""
s += "Question: " + question + "\n"
for i in range(1, 5):
s += f"Thought {i}:" + sgl.gen(stop=[".", "\n"]) + ".\n"
s += f"Action {i}: " + sgl.select(f"action_{i}", ["Search", "Finish"])
if s[f"action_{i}"] == "Search":
s += " [" + sgl.gen(stop="]") + "].\n"
s += f"Observation {i}:" + sgl.gen(stop=[".", "\n"]) + ".\n"
else:
s += " [" + sgl.gen("answer", stop="]") + "].\n"
break
ret = react.run(
question="What country does the creator of Linux live in?",
temperature=0.1,
)
answer = ret["answer"].lower()
assert "finland" in answer or "states" in answer
def test_parallel_decoding():
max_tokens = 64
number = 5
@sgl.function
def parallel_decoding(s, topic):
s += "Act as a helpful assistant.\n"
s += "USER: Give some tips for " + topic + ".\n"
s += (
"ASSISTANT: Okay. Here are "
+ str(number)
+ " concise tips, each under 8 words:\n"
)
# Generate skeleton
for i in range(1, 1 + number):
s += f"{i}." + sgl.gen(max_tokens=16, stop=[".", "\n"]) + ".\n"
# Generate detailed tips
forks = s.fork(number)
for i in range(number):
forks[
i
] += f"Now, I expand tip {i+1} into a detailed paragraph:\nTip {i+1}:"
forks[i] += sgl.gen("detailed_tip", max_tokens, stop=["\n\n"])
forks.join()
# Concatenate tips and summarize
s += "Here are these tips with detailed explanation:\n"
for i in range(number):
s += f"Tip {i+1}:" + forks[i]["detailed_tip"] + "\n"
s += "\nIn summary," + sgl.gen("summary", max_tokens=512)
ret = parallel_decoding.run(topic="writing a good blog post", temperature=0.3)
def test_parallel_encoding(check_answer=True):
max_tokens = 64
@sgl.function
def parallel_encoding(s, question, context_0, context_1, context_2):
s += "USER: I will ask a question based on some statements.\n"
s += "ASSISTANT: Sure. I will give the answer.\n"
s += "USER: Please memorize these statements.\n"
contexts = [context_0, context_1, context_2]
forks = s.fork(len(contexts))
forks += lambda i: f"Statement {i}: " + contexts[i] + "\n"
forks.join(mode="concate_and_append")
s += "Now, please answer the following question. " "Do not list options."
s += "\nQuestion: " + question + "\n"
s += "ASSISTANT:" + sgl.gen("answer", max_tokens=max_tokens)
ret = parallel_encoding.run(
question="Who is the father of Julian?",
context_0="Ethan is the father of Liam.",
context_1="Noah is the father of Julian.",
context_2="Oliver is the father of Carlos.",
temperature=0,
)
answer = ret["answer"]
if check_answer:
assert "Noah" in answer
def test_image_qa():
@sgl.function
def image_qa(s, question):
s += sgl.user(sgl.image("image.png") + question)
s += sgl.assistant(sgl.gen("answer"))
state = image_qa.run(
question="Please describe this image in simple words.",
temperature=0,
max_new_tokens=64,
)
assert "taxi" in state.messages()[-1]["content"]
def test_stream():
@sgl.function
def qa(s, question):
s += sgl.user(question)
s += sgl.assistant(sgl.gen("answer"))
ret = qa(
question="Compose an engaging travel blog post about a recent trip to Hawaii, highlighting cultural experiences and must-see attractions.",
stream=True,
)
out = ""
for chunk in ret.text_iter():
out += chunk
ret = qa(
question="Compose an engaging travel blog post about a recent trip to Hawaii, highlighting cultural experiences and must-see attractions.",
stream=True,
)
out = ""
for chunk in ret.text_iter("answer"):
out += chunk
def test_regex():
regex = r"((25[0-5]|2[0-4]\d|[01]?\d\d?).){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)"
@sgl.function
def regex_gen(s):
s += "Q: What is the IP address of the Google DNS servers?\n"
s += "A: " + sgl.gen(
"answer",
temperature=0,
regex=regex,
)
state = regex_gen.run()
answer = state["answer"]
assert re.match(regex, answer)
"""Common utilities for testing and benchmarking"""
import numpy as np
import requests
from sglang.backend.openai import OpenAI
from sglang.backend.runtime_endpoint import RuntimeEndpoint
from sglang.global_config import global_config
def call_generate_lightllm(prompt, temperature, max_tokens, stop, url):
data = {
"inputs": prompt,
"parameters": {
"temperature": temperature,
"max_new_tokens": max_tokens,
"stop_sequences": stop,
},
}
res = requests.post(url, json=data)
assert res.status_code == 200
pred = res.json()["generated_text"][0]
return pred
def call_generate_vllm(prompt, temperature, max_tokens, stop, url, n=1):
data = {
"prompt": prompt,
"temperature": temperature,
"max_tokens": max_tokens,
"stop": stop,
"n": n,
}
res = requests.post(url, json=data)
assert res.status_code == 200
if n == 1:
pred = res.json()["text"][0][len(prompt) :]
else:
pred = [x[len(prompt) :] for x in res.json()["text"]]
return pred
def call_generate_srt_raw(prompt, temperature, max_tokens, stop, url):
data = {
"text": prompt,
"sampling_params": {
"temperature": temperature,
"max_new_tokens": max_tokens,
"stop": stop,
},
}
res = requests.post(url, json=data)
assert res.status_code == 200
obj = res.json()
pred = obj["text"]
return pred
def call_select_lightllm(context, choices, url):
scores = []
for i in range(len(choices)):
data = {
"inputs": context + choices[i],
"parameters": {
"max_new_tokens": 1,
},
}
res = requests.post(url, json=data)
assert res.status_code == 200
        # The LightLLM endpoint does not return a usable score here, so a
        # placeholder of 0 is recorded for every choice.
        scores.append(0)
return np.argmax(scores)
def call_select_vllm(context, choices, url):
scores = []
for i in range(len(choices)):
data = {
"prompt": context + choices[i],
"max_tokens": 1,
"prompt_logprobs": 1,
}
res = requests.post(url, json=data)
assert res.status_code == 200
scores.append(res.json()["prompt_score"])
return np.argmax(scores)
"""
Modify vllm/entrypoints/api_server.py
if final_output.prompt_logprobs is not None:
score = np.mean([prob[t_id] for t_id, prob in zip(final_output.prompt_token_ids[1:], final_output.prompt_logprobs[1:])])
ret["prompt_score"] = score
"""
def add_common_other_args_and_parse(parser):
parser.add_argument("--parallel", type=int, default=96)
parser.add_argument("--host", type=str, default="http://127.0.0.1")
parser.add_argument("--port", type=int, default=None)
parser.add_argument(
"--backend",
type=str,
required=True,
choices=["vllm", "lightllm", "guidance", "lmql", "srt-raw", "llama.cpp"],
)
parser.add_argument(
"--model-path", type=str, default="meta-llama/Llama-2-7b-chat-hf"
)
parser.add_argument("--result-file", type=str, default="result.jsonl")
args = parser.parse_args()
if args.port is None:
default_port = {
"vllm": 21000,
"lightllm": 22000,
"lmql": 23000,
"srt-raw": 30000,
}
args.port = default_port.get(args.backend, None)
return args
def add_common_sglang_args_and_parse(parser):
parser.add_argument("--parallel", type=int, default=64)
parser.add_argument("--host", type=str, default="http://127.0.0.1")
parser.add_argument("--port", type=int, default=30000)
parser.add_argument("--backend", type=str, default="srt")
parser.add_argument("--result-file", type=str, default="result.jsonl")
args = parser.parse_args()
return args
def select_sglang_backend(args):
if args.backend.startswith("srt"):
if args.backend == "srt-no-parallel":
global_config.enable_parallel_decoding = False
global_config.enable_parallel_encoding = False
backend = RuntimeEndpoint(f"{args.host}:{args.port}")
elif args.backend.startswith("gpt"):
backend = OpenAI(args.backend)
else:
raise ValueError(f"Invalid backend: {args.backend}")
return backend
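# Illustrative sketch: wiring the argument helpers and backend selection
# together in a benchmark script (assumes `argparse` is imported).
#
#   parser = argparse.ArgumentParser()
#   args = add_common_sglang_args_and_parse(parser)
#   backend = select_sglang_backend(args)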
"""Common utilities."""
import base64
import json
import threading
import urllib.request
from io import BytesIO
from json import dumps
import requests
def get_available_gpu_memory(gpu_id, distributed=True):
"""
Get available memory for cuda:gpu_id device.
When distributed is True, the available memory is the minimum available memory of all GPUs.
"""
import torch
num_gpus = torch.cuda.device_count()
assert gpu_id < num_gpus
if torch.cuda.current_device() != gpu_id:
        print(
            f"WARN: current device is not {gpu_id}, but {torch.cuda.current_device()}, "
            "which may cause a useless CUDA context allocation on the current device.",
        )
free_gpu_memory, _ = torch.cuda.mem_get_info(gpu_id)
if distributed:
tensor = torch.tensor(free_gpu_memory, dtype=torch.float32).to(
torch.device("cuda", gpu_id)
)
torch.distributed.all_reduce(tensor, op=torch.distributed.ReduceOp.MIN)
free_gpu_memory = tensor.item()
return free_gpu_memory / (1 << 30)
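# Illustrative sketch (single-GPU case, no distributed reduction):
#
#   free_gb = get_available_gpu_memory(gpu_id=0, distributed=False)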
def is_same_type(values):
"""Return whether the elements in values are of the same type."""
if len(values) <= 1:
return True
else:
t = type(values[0])
return all(isinstance(v, t) for v in values[1:])
def read_jsonl(filename: str):
"""Read a JSONL file."""
rets = []
with open(filename) as fin:
for line in fin:
if line.startswith("#"):
continue
rets.append(json.loads(line))
return rets
def dump_state_text(filename, states, mode="w"):
"""Dump program state in a text file."""
from sglang.lang.interpreter import ProgramState
with open(filename, mode) as fout:
for i, s in enumerate(states):
if isinstance(s, str):
pass
elif isinstance(s, ProgramState):
s = s.text().strip()
else:
s = str(s)
fout.write(
"=" * 40 + f" {i} " + "=" * 40 + "\n" + s + "\n" + "=" * 80 + "\n\n"
)
class HttpResponse:
def __init__(self, resp):
self.resp = resp
def json(self):
return json.loads(self.resp.read())
@property
def status_code(self):
return self.resp.status
def http_request(url, json=None, stream=False):
"""A faster version of requests.post with low-level urllib API."""
if stream:
return requests.post(url, json=json, stream=True)
else:
req = urllib.request.Request(url)
req.add_header("Content-Type", "application/json; charset=utf-8")
if json is None:
data = None
else:
data = bytes(dumps(json), encoding="utf-8")
resp = urllib.request.urlopen(req, data=data)
return HttpResponse(resp)
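# Illustrative sketch (endpoint and payload are examples only):
#
#   resp = http_request(
#       "http://127.0.0.1:30000/generate",
#       json={"text": "Hello", "sampling_params": {"max_new_tokens": 8}},
#   )
#   print(resp.status_code, resp.json())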
def encode_image_base64(image_path):
"""Encode an image in base64."""
if isinstance(image_path, str):
with open(image_path, "rb") as image_file:
data = image_file.read()
return base64.b64encode(data).decode("utf-8")
elif isinstance(image_path, bytes):
return base64.b64encode(image_path).decode("utf-8")
else:
# image_path is PIL.WebPImagePlugin.WebPImageFile
image = image_path
buffered = BytesIO()
image.save(buffered, format="PNG")
return base64.b64encode(buffered.getvalue()).decode("utf-8")
def _is_chinese_char(cp):
"""Checks whether CP is the codepoint of a CJK character."""
# This defines a "chinese character" as anything in the CJK Unicode block:
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
#
# Note that the CJK Unicode block is NOT all Japanese and Korean characters,
# despite its name. The modern Korean Hangul alphabet is a different block,
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
# space-separated words, so they are not treated specially and handled
    # like all of the other languages.
if (
(cp >= 0x4E00 and cp <= 0x9FFF)
or (cp >= 0x3400 and cp <= 0x4DBF) #
or (cp >= 0x20000 and cp <= 0x2A6DF) #
or (cp >= 0x2A700 and cp <= 0x2B73F) #
or (cp >= 0x2B740 and cp <= 0x2B81F) #
or (cp >= 0x2B820 and cp <= 0x2CEAF) #
or (cp >= 0xF900 and cp <= 0xFAFF)
or (cp >= 0x2F800 and cp <= 0x2FA1F) #
): #
return True
return False
def find_printable_text(text):
"""Returns the longest printable substring of text that contains only entire words."""
# Borrowed from https://github.com/huggingface/transformers/blob/061580c82c2db1de9139528243e105953793f7a2/src/transformers/generation/streamers.py#L99
# After the symbol for a new line, we flush the cache.
if text.endswith("\n"):
return text
# If the last token is a CJK character, we print the characters.
elif len(text) > 0 and _is_chinese_char(ord(text[-1])):
return text
# Otherwise, prints until the last space char (simple heuristic to avoid printing incomplete words,
# which may change with the subsequent token -- there are probably smarter ways to do this!)
else:
return text[: text.rfind(" ") + 1]
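# Illustrative behavior:
#
#   find_printable_text("Hello wor")      # -> "Hello "  (hold back the partial word)
#   find_printable_text("Hello world\n")  # -> "Hello world\n"  (a newline flushes everything)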
def run_with_timeout(func, args=(), kwargs=None, timeout=None):
"""Run a function with timeout."""
ret_value = []
def _target_func():
ret_value.append(func(*args, **(kwargs or {})))
t = threading.Thread(target=_target_func)
t.start()
t.join(timeout=timeout)
if t.is_alive():
raise TimeoutError()
if not ret_value:
raise RuntimeError()
return ret_value[0]
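# Illustrative sketch (the wrapped call is hypothetical):
#
#   value = run_with_timeout(lambda x: x + 1, args=(1,), timeout=5)   # -> 2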
kill -9 $(ps aux | grep 'python' | grep -v 'grep' | awk '{print $2}')
import argparse
import glob
import multiprocessing
import os
import time
import unittest
from sglang.utils import run_with_timeout
def run_unittest_files(files, args):
for filename in files:
def func():
print(filename)
ret = unittest.main(module=None, argv=["", "-vb"] + [filename])
p = multiprocessing.Process(target=func)
def run_one_file():
p.start()
p.join()
try:
run_with_timeout(run_one_file, timeout=args.time_limit_per_file)
if p.exitcode != 0:
return False
except TimeoutError:
p.terminate()
time.sleep(5)
print(
f"\nTimeout after {args.time_limit_per_file} seconds "
f"when running {filename}"
)
return False
return True
if __name__ == "__main__":
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument(
"--time-limit-per-file",
type=int,
default=1000,
help="The time limit for running one file in seconds.",
)
args = arg_parser.parse_args()
files = glob.glob("**/test_*.py", recursive=True)
tic = time.time()
success = run_unittest_files(files, args)
if success:
print(f"Success. Time elapsed: {time.time() - tic:.2f}s")
else:
print(f"Fail. Time elapsed: {time.time() - tic:.2f}s")
exit(0 if success else -1)
import json
import unittest
from sglang.test.test_programs import test_mt_bench, test_stream
from sglang import Anthropic, set_default_backend
class TestAnthropicBackend(unittest.TestCase):
backend = None
chat_backend = None
def setUp(self):
cls = type(self)
if cls.backend is None:
cls.backend = Anthropic("claude-2")
set_default_backend(cls.backend)
def test_mt_bench(self):
test_mt_bench()
def test_stream(self):
test_stream()
if __name__ == "__main__":
unittest.main(warnings="ignore")
# from sglang.global_config import global_config
# global_config.verbosity = 2
# t = TestAnthropicBackend()
# t.setUp()
# t.test_mt_bench()
import unittest
from sglang.backend.runtime_endpoint import RuntimeEndpoint
import sglang as sgl
class TestBind(unittest.TestCase):
backend = None
def setUp(self):
cls = type(self)
if cls.backend is None:
cls.backend = RuntimeEndpoint(base_url="http://localhost:30000")
def test_bind(self):
@sgl.function
def few_shot_qa(s, prompt, question):
s += prompt
s += "Q: What is the capital of France?\n"
s += "A: Paris\n"
s += "Q: " + question + "\n"
s += "A:" + sgl.gen("answer", stop="\n")
few_shot_qa_2 = few_shot_qa.bind(
prompt="The following are questions with answers.\n\n"
)
tracer = few_shot_qa_2.trace()
print(tracer.last_node.print_graph_dfs() + "\n")
def test_pin(self):
@sgl.function
def few_shot_qa(s, prompt, question):
s += prompt
s += "Q: What is the capital of France?\n"
s += "A: Paris\n"
s += "Q: " + question + "\n"
s += "A:" + sgl.gen("answer", stop="\n")
few_shot_qa_2 = few_shot_qa.bind(
prompt="Answer the following questions as if you were a 5-year-old kid.\n\n"
)
few_shot_qa_2.pin(self.backend)
few_shot_qa_2.unpin(self.backend)
if __name__ == "__main__":
unittest.main(warnings="ignore")
# t = TestBind()
# t.setUp()
# t.test_pin()
import unittest
from sglang.test.test_programs import (
test_decode_int,
test_decode_json,
test_expert_answer,
test_few_shot_qa,
test_image_qa,
test_mt_bench,
test_parallel_decoding,
test_parallel_encoding,
test_react,
test_select,
test_stream,
test_tool_use,
)
from sglang import OpenAI, set_default_backend
class TestOpenAIBackend(unittest.TestCase):
backend = None
chat_backend = None
chat_vision_backend = None
def setUp(self):
cls = type(self)
if cls.backend is None:
cls.backend = OpenAI("gpt-3.5-turbo-instruct")
cls.chat_backend = OpenAI("gpt-3.5-turbo")
cls.chat_vision_backend = OpenAI("gpt-4-vision-preview")
def test_few_shot_qa(self):
set_default_backend(self.backend)
test_few_shot_qa()
def test_mt_bench(self):
set_default_backend(self.chat_backend)
test_mt_bench()
def test_select(self):
set_default_backend(self.backend)
test_select(check_answer=True)
def test_decode_int(self):
set_default_backend(self.backend)
test_decode_int()
def test_decode_json(self):
set_default_backend(self.backend)
test_decode_json()
def test_expert_answer(self):
set_default_backend(self.backend)
test_expert_answer()
def test_tool_use(self):
set_default_backend(self.backend)
test_tool_use()
def test_react(self):
set_default_backend(self.backend)
test_react()
def test_parallel_decoding(self):
set_default_backend(self.backend)
test_parallel_decoding()
def test_parallel_encoding(self):
set_default_backend(self.backend)
test_parallel_encoding()
def test_image_qa(self):
set_default_backend(self.chat_vision_backend)
test_image_qa()
def test_stream(self):
set_default_backend(self.backend)
test_stream()
if __name__ == "__main__":
unittest.main(warnings="ignore")
# from sglang.global_config import global_config
# global_config.verbosity = 2
# t = TestOpenAIBackend()
# t.setUp()
# t.test_decode_json()
"""
python3 -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
"""
import json
import unittest
from sglang.test.test_programs import (
test_decode_int,
test_decode_json,
test_expert_answer,
test_few_shot_qa,
test_mt_bench,
test_parallel_decoding,
test_parallel_encoding,
test_react,
test_regex,
test_select,
test_stream,
test_tool_use,
)
import sglang as sgl
class TestSRTBackend(unittest.TestCase):
backend = None
def setUp(self):
cls = type(self)
if cls.backend is None:
cls.backend = sgl.RuntimeEndpoint(base_url="http://localhost:30000")
sgl.set_default_backend(cls.backend)
def test_few_shot_qa(self):
test_few_shot_qa()
def test_mt_bench(self):
test_mt_bench()
def test_select(self):
test_select(check_answer=False)
def test_decode_int(self):
test_decode_int()
def test_expert_answer(self):
test_expert_answer()
def test_tool_use(self):
test_tool_use()
def test_parallel_decoding(self):
test_parallel_decoding()
def test_stream(self):
test_stream()
def test_regex(self):
test_regex()
# def test_parallel_encoding(self):
# test_parallel_encoding(check_answer=False)
if __name__ == "__main__":
unittest.main(warnings="ignore")
# from sglang.global_config import global_config
# global_config.verbosity = 2
# t = TestSRTBackend()
# t.setUp()
# t.test_regex()
import unittest
from sglang.backend.base_backend import BaseBackend
from sglang.lang.chat_template import get_chat_template
import sglang as sgl
class TestTracing(unittest.TestCase):
def test_few_shot_qa(self):
@sgl.function
def few_shot_qa(s, question):
s += "The following are questions with answers.\n\n"
s += "Q: What is the capital of France?\n"
s += "A: Paris\n"
s += "Q: " + question + "\n"
s += "A:" + sgl.gen("answer", stop="\n")
tracer = few_shot_qa.trace()
print(tracer.last_node.print_graph_dfs() + "\n")
def test_select(self):
@sgl.function
def capital(s):
s += "The capital of France is"
s += sgl.select("capital", ["Paris. ", "London. "])
s += "It is a city" + sgl.gen("description", stop=".")
tracer = capital.trace()
print(tracer.last_node.print_graph_dfs() + "\n")
def test_raise_warning(self):
@sgl.function
def wrong(s, question):
s += f"I want to ask {question}"
try:
tracer = wrong.trace()
raised = False
except TypeError:
raised = True
assert raised
def test_multi_function(self):
@sgl.function
def expand(s, tip):
s += (
"Please expand the following tip into a detailed paragraph:"
+ tip
+ "\n"
)
s += sgl.gen("detailed_tip")
@sgl.function
def tip_suggestion(s, topic):
s += "Here are 2 tips for " + topic + ".\n"
s += "1." + sgl.gen("tip_1", stop=["\n", ":", "."]) + "\n"
s += "2." + sgl.gen("tip_2", stop=["\n", ":", "."]) + "\n"
branch1 = expand(tip=s["tip_1"])
branch2 = expand(tip=s["tip_2"])
s += "Tip 1: " + branch1["detailed_tip"] + "\n"
s += "Tip 2: " + branch2["detailed_tip"] + "\n"
s += "In summary" + sgl.gen("summary")
compiled = tip_suggestion.compile()
compiled.print_graph()
sgl.set_default_backend(sgl.OpenAI("gpt-3.5-turbo-instruct"))
state = compiled.run(topic="staying healthy")
print(state.text() + "\n")
states = compiled.run_batch(
[
{"topic": "staying healthy"},
{"topic": "staying happy"},
{"topic": "earning money"},
],
temperature=0,
)
for s in states:
print(s.text() + "\n")
def test_role(self):
@sgl.function
def multi_turn_chat(s):
s += sgl.user("Who are you?")
s += sgl.assistant(sgl.gen("answer_1"))
s += sgl.user("Who created you?")
s += sgl.assistant(sgl.gen("answer_2"))
backend = BaseBackend()
backend.chat_template = get_chat_template("llama-2-chat")
compiled = multi_turn_chat.compile(backend=backend)
compiled.print_graph()
def test_fork(self):
@sgl.function
def tip_suggestion(s):
s += (
"Here are three tips for staying healthy: "
"1. Balanced Diet; "
"2. Regular Exercise; "
"3. Adequate Sleep\n"
)
forks = s.fork(3)
for i in range(3):
forks[i] += f"Now, expand tip {i+1} into a paragraph:\n"
forks[i] += sgl.gen(f"detailed_tip")
s += "Tip 1:" + forks[0]["detailed_tip"] + "\n"
s += "Tip 2:" + forks[1]["detailed_tip"] + "\n"
s += "Tip 3:" + forks[2]["detailed_tip"] + "\n"
s += "In summary" + sgl.gen("summary")
tracer = tip_suggestion.trace()
print(tracer.last_node.print_graph_dfs())
a = tip_suggestion.run(backend=sgl.OpenAI("gpt-3.5-turbo-instruct"))
print(a.text())
if __name__ == "__main__":
unittest.main(warnings="ignore")
# t = TestTracing()
# t.test_fork()
import multiprocessing as mp
import time
from dataclasses import dataclass
import torch
import torch.distributed as dist
from sglang.srt.managers.router.model_runner import ModelRunner
from sglang.srt.model_config import ModelConfig
@dataclass
class BenchBatch:
req_to_token_pool: torch.Tensor
token_to_kv_pool: torch.Tensor
input_ids: torch.Tensor = None
position_ids_offsets: torch.Tensor = None
seq_lens: torch.Tensor = None
prefix_lens: torch.Tensor = None
req_pool_indices: torch.Tensor = None
out_cache_loc: torch.Tensor = None
out_cache_cont_start: torch.Tensor = None
out_cache_cont_end: torch.Tensor = None
def __init__(self, model_runner: ModelRunner):
self.req_to_token_pool = model_runner.req_to_token_pool
self.token_to_kv_pool = model_runner.token_to_kv_pool
def init_prefill_batch(self, input_ids, batch_size, seq_len):
self.input_ids = input_ids
self.position_ids_offsets = torch.zeros(
batch_size, dtype=torch.int32, device="cuda"
)
self.seq_lens = torch.full(
(batch_size,), seq_len, dtype=torch.int32, device="cuda"
)
self.prefix_lens = torch.zeros(batch_size, dtype=torch.int32, device="cuda")
self.req_pool_indices = self.req_to_token_pool.alloc(batch_size)
self.out_cache_loc = self.token_to_kv_pool.alloc(batch_size * seq_len)
for i in range(batch_size):
n_idx = self.req_pool_indices[i].item()
self.req_to_token_pool.req_to_token[n_idx, :seq_len] = self.out_cache_loc[
i * seq_len : (i + 1) * seq_len
]
def update_extend(
self, input_ids, batch_size, prefix_len, extend_len, prefix_req_idx
):
self.input_ids = input_ids
self.position_ids_offsets = torch.zeros(
batch_size, dtype=torch.int32, device="cuda"
)
self.seq_lens = torch.full(
(batch_size,), prefix_len + extend_len, dtype=torch.int32, device="cuda"
)
self.prefix_lens = torch.full(
(batch_size,), prefix_len, dtype=torch.int32, device="cuda"
)
self.req_pool_indices = self.req_to_token_pool.alloc(batch_size)
self.out_cache_loc = self.token_to_kv_pool.alloc(batch_size * extend_len)
req_to_token = self.req_to_token_pool.req_to_token
fork_num = batch_size // prefix_req_idx.shape[0]
for i in range(batch_size):
p_idx = prefix_req_idx[i // fork_num].item()
n_idx = self.req_pool_indices[i].item()
req_to_token[n_idx, :prefix_len] = req_to_token[p_idx, :prefix_len]
req_to_token[
n_idx, prefix_len : prefix_len + extend_len
] = self.out_cache_loc[i * extend_len : (i + 1) * extend_len]
def update_decode(self, predict_ids, batch_size):
assert predict_ids.shape[0] == batch_size
assert batch_size == self.req_pool_indices.shape[0]
self.input_ids = predict_ids.reshape(-1)
self.prefix_lens = None
(
self.out_cache_loc,
self.out_cache_cont_start,
self.out_cache_cont_end,
) = self.token_to_kv_pool.alloc_contiguous(batch_size)
self.req_to_token_pool.req_to_token[
self.req_pool_indices, self.seq_lens
] = self.out_cache_loc
self.seq_lens.add_(1)
def prefill(model_runner: ModelRunner, batch: BenchBatch):
logits, _ = model_runner.forward_extend(
batch.input_ids,
batch.req_pool_indices,
batch.seq_lens,
batch.prefix_lens,
batch.position_ids_offsets,
batch.out_cache_loc,
False,
)
prob_out = torch.softmax(logits, dim=-1)
predict_ids = torch.argmax(prob_out, dim=1, keepdim=True)
predict_ids = predict_ids.detach().cpu().numpy()
return predict_ids
def extend(model_runner: ModelRunner, batch: BenchBatch):
logits, _ = model_runner.forward_extend(
batch.input_ids,
batch.req_pool_indices,
batch.seq_lens,
batch.prefix_lens,
batch.position_ids_offsets,
batch.out_cache_loc,
True,
)
prob_out = torch.softmax(logits, dim=-1)
predict_ids = torch.argmax(prob_out, dim=1, keepdim=True)
predict_ids = predict_ids.detach().cpu().numpy()
return predict_ids
def decode(model_runner: ModelRunner, batch: BenchBatch):
logits = model_runner.forward_decode(
batch.input_ids,
batch.req_pool_indices,
batch.seq_lens,
None,
batch.position_ids_offsets,
None,
batch.out_cache_cont_start,
batch.out_cache_cont_end,
)
prob_out = torch.softmax(logits, dim=-1)
predict_ids = torch.argmax(prob_out, dim=1, keepdim=True)
predict_ids = predict_ids.detach().cpu().numpy()
return predict_ids
def bench_generate_worker(
model_path,
tp_rank,
tp_size,
shared_num,
unique_num,
shared_len,
unique_len,
decode_len,
model_mode,
):
assert unique_num % shared_num == 0
model_config = ModelConfig(path=model_path)
model_runner = ModelRunner(
model_config=model_config,
mem_fraction_static=0.8,
tp_rank=tp_rank,
tp_size=tp_size,
nccl_port=28888,
model_mode=model_mode,
)
batch = BenchBatch(model_runner)
# warm up
for _ in range(1):
input_ids = torch.randint(
low=5, high=100, size=(shared_num * shared_len,)
).cuda()
batch.init_prefill_batch(input_ids, shared_num, shared_len)
_ = prefill(model_runner, batch)
input_ids = torch.randint(
low=5, high=100, size=(unique_num * unique_len,)
).cuda()
batch.update_extend(
input_ids, unique_num, shared_len, unique_len, batch.req_pool_indices
)
predict_ids = extend(model_runner, batch)
for i in range(decode_len):
predict_ids = torch.from_numpy(predict_ids).cuda()
batch.update_decode(predict_ids, unique_num)
predict_ids = decode(model_runner, batch)
model_runner.req_to_token_pool.clear()
model_runner.token_to_kv_pool.clear()
if tp_size > 1:
dist.barrier()
prefill_start = time.time()
input_ids = torch.randint(low=5, high=100, size=(shared_num * shared_len,)).cuda()
batch.init_prefill_batch(input_ids, shared_num, shared_len)
_ = prefill(model_runner, batch)
if tp_rank == 0:
print(f"prefill: {(time.time() - prefill_start) * 1000:.2f} ms")
extend_start = time.time()
input_ids = torch.randint(low=5, high=100, size=(unique_num * unique_len,)).cuda()
batch.update_extend(
input_ids, unique_num, shared_len, unique_len, batch.req_pool_indices
)
predict_ids = extend(model_runner, batch)
if tp_rank == 0:
print(f"extend: {(time.time() - extend_start) * 1000:.2f} ms")
for i in range(decode_len):
decode_start = time.time()
predict_ids = torch.from_numpy(predict_ids).cuda()
batch.update_decode(predict_ids, unique_num)
predict_ids = decode(model_runner, batch)
if tp_rank == 0:
print(f"decode {i}: {(time.time() - decode_start) * 1000:.2f} ms")
def bench_generate(
model_path,
tp_size,
shared_num,
unique_num,
shared_len,
unique_len,
decode_len,
model_mode,
):
print(
f"tp_size: {tp_size}, "
f"shared_num: {shared_num}, "
f"unique_num: {unique_num}, "
f"shared_len: {shared_len}, "
f"unique_len: {unique_len}, "
f"decode_len: {decode_len}, "
f"model_mode: {model_mode}"
)
workers = []
for tp_rank in range(tp_size):
proc = mp.Process(
target=bench_generate_worker,
args=(
model_path,
tp_rank,
tp_size,
shared_num,
unique_num,
shared_len,
unique_len,
decode_len,
model_mode,
),
)
proc.start()
workers.append(proc)
for proc in workers:
proc.join()
if __name__ == "__main__":
bench_generate(
model_path="meta-llama/Llama-2-7b-chat-hf",
tp_size=1,
shared_num=1,
unique_num=32,
shared_len=256,
unique_len=256,
decode_len=8,
model_mode=[],
)
import argparse
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
@torch.inference_mode()
def normal_text(args):
t = AutoTokenizer.from_pretrained(args.model_path)
m = AutoModelForCausalLM.from_pretrained(
args.model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True
)
m.cuda()
print(m)
prompts = [
"The capital of France is",
"The capital of the United Kindom is",
"Today is a sunny day and I like",
]
max_new_tokens = 32
for p in prompts:
if isinstance(p, str):
input_ids = t.encode(p, return_tensors="pt").cuda()
else:
input_ids = torch.tensor([p], device="cuda")
output_ids = m.generate(
input_ids, do_sample=False, max_new_tokens=max_new_tokens
)
output_str = t.decode(output_ids[0])
print(output_str)
prefill_logits = m.forward(input_ids).logits[0][-1]
print("prefill logits", prefill_logits)
@torch.inference_mode()
def synthetic_tokens(args):
t = AutoTokenizer.from_pretrained(args.model_path)
m = AutoModelForCausalLM.from_pretrained(
args.model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True
)
m.cuda()
print(m)
input_len = 256
output_len = 8
prompts = [list(range(5, 5 + input_len))]
for p in prompts:
input_ids = p
for i in range(output_len + 1):
prefill_logits = m.forward(torch.tensor([input_ids], device="cuda")).logits[
0
][-1]
if i == 0:
print("prefill logits", prefill_logits)
else:
print("decode", i - 1, prefill_logits)
input_ids.append(torch.argmax(prefill_logits).item())
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--model-path",
type=str,
default="TinyLlama/TinyLlama-1.1B-Chat-v0.4",
# default="meta-llama/Llama-2-7b-chat-hf",
)
args = parser.parse_args()
normal_text(args)
# synthetic_tokens(args)
import multiprocessing
import os
import time
import numpy as np
import torch
import torch.distributed as dist
import transformers
from sglang.srt.managers.router.infer_batch import Batch, ForwardMode, Req
from sglang.srt.managers.router.model_runner import ModelRunner
from sglang.srt.model_config import ModelConfig
from sglang.srt.sampling_params import SamplingParams
def test_generate_worker(model_path, tp_rank, tp_size):
model_config = ModelConfig(path=model_path)
model = ModelRunner(model_config, 0.8, tp_rank, tp_size, 28888)
tokenizer = transformers.AutoTokenizer.from_pretrained(model_path)
# Input
prompts = [
"The capital of France is",
"Today is a sunny day and I like",
]
sampling_params = SamplingParams(temperature=0)
cut_num = 4
reqs = []
for i in range(len(prompts)):
req = Req(i)
req.input_ids = tokenizer.encode(prompts[i])[:cut_num]
req.sampling_params = sampling_params
reqs.append(req)
# Prefill
batch = Batch(reqs, model.req_to_token_pool, model.token_to_kv_pool, None)
batch.init_extend_batch(model.model_config.vocab_size(), None)
logits, _ = model.forward(batch, ForwardMode.EXTEND)
next_token_ids, next_token_probs = batch.sample(logits)
print("extend logits (first)", logits)
# Extend
for i in range(len(prompts)):
req = reqs[i]
req.input_ids += tokenizer.encode(prompts[i])[cut_num:]
req.prefix_indices = model.req_to_token_pool.req_to_token[
batch.req_pool_indices[i], :cut_num
]
batch = Batch(reqs, model.req_to_token_pool, model.token_to_kv_pool, None)
batch.init_extend_batch(model.model_config.vocab_size(), None)
logits, _ = model.forward(batch, ForwardMode.EXTEND)
next_token_ids, next_token_probs = batch.sample(logits)
print("extend logits", logits)
print(
"next_token_ids", next_token_ids, [tokenizer.decode(x) for x in next_token_ids]
)
# Decode
for i in range(6):
batch.update_for_decode(next_token_ids.cpu().numpy())
logits = model.forward(batch, ForwardMode.DECODE)
next_token_ids, next_token_probs = batch.sample(logits)
print(
"next_token_ids",
next_token_ids,
[tokenizer.decode(x) for x in next_token_ids],
)
def test_generate(model_path, tp_size):
workers = []
for tp_rank in range(tp_size):
proc = multiprocessing.Process(
target=test_generate_worker,
args=(
model_path,
tp_rank,
tp_size,
),
)
proc.start()
workers.append(proc)
for proc in workers:
proc.join()
if __name__ == "__main__":
os.environ["TOKENIZERS_PARALLELISM"] = "false"
test_generate("TinyLlama/TinyLlama-1.1B-Chat-v0.4", 1)
# Reference output for TinyLlama-1.1B-Chat-v0.4
# extend logits (first) tensor([[-10.0312, -9.5000, 0.8896, ..., -4.9375, -3.2402, -3.3633],
# [ -9.1797, -10.2500, 2.7168, ..., -4.3359, -4.0664, -4.1289]],
# device='cuda:0', dtype=torch.float16)
# extend logits tensor([[-8.3125, -7.1172, 3.3359, ..., -4.9531, -4.1289, -3.4121],
# [-9.6406, -9.0547, 4.0195, ..., -5.3086, -4.7188, -4.4609]],
# device='cuda:0', dtype=torch.float16)
# next_token_ids tensor([3681, 304], device='cuda:0') ['Paris', 'to']
# next_token_ids tensor([29889, 748], device='cuda:0') ['.', 'go']
# next_token_ids tensor([ 13, 363], device='cuda:0') ['\n', 'for']
# next_token_ids tensor([1576, 263], device='cuda:0') ['The', 'a']
# next_token_ids tensor([7483, 6686], device='cuda:0') ['capital', 'walk']
# next_token_ids tensor([310, 297], device='cuda:0') ['of', 'in']
# next_token_ids tensor([278, 278], device='cuda:0') ['the', 'the']
import multiprocessing
import time
import numpy as np
import torch
import torch.distributed as dist
from sglang.srt.managers.router.model_runner import ModelRunner
from sglang.srt.model_config import ModelConfig
def test_generate_worker(
model_path, tp_rank, tp_size, batch_size, input_len, output_len
):
model_config = ModelConfig(path=model_path)
model = ModelRunner(model_config, 0.8, tp_rank, tp_size, 28888)
# Prepare data
input_ids = np.vstack([np.arange(5, input_len + 5) for _ in range(batch_size)])
input_ids = input_ids.reshape(-1)
input_ids = torch.tensor(input_ids).cuda()
def init_batch_data(model, batch_size, input_len):
req_pool_indices = model.req_to_token_pool.alloc(batch_size)
seq_lens = torch.full(
(batch_size,), input_len, dtype=torch.int32, device="cuda"
)
prefix_lens = torch.zeros(batch_size, dtype=torch.int32, device="cuda")
position_ids_offsets = torch.zeros(batch_size, dtype=torch.int32, device="cuda")
out_cache_loc = model.token_to_kv_pool.alloc(batch_size * input_len)
for i in range(batch_size):
req_idx = req_pool_indices[i].item()
model.req_to_token_pool.req_to_token[req_idx, :input_len] = out_cache_loc[
i * input_len : (i + 1) * input_len
]
return (
req_pool_indices,
seq_lens,
prefix_lens,
position_ids_offsets,
out_cache_loc,
)
def prefill(print_logits):
nonlocal predict_ids
logits, _ = model.forward_prefill(
input_ids,
req_pool_indices,
seq_lens,
prefix_lens,
position_ids_offsets,
out_cache_loc,
False,
)
prob_out = torch.softmax(logits, dim=-1)
predict_ids = torch.argmax(prob_out, dim=1, keepdim=True)
predict_ids = predict_ids.detach().cpu().numpy()
if print_logits and tp_rank == 0:
print("prefill logits", logits, logits.shape)
def decode(print_logits):
nonlocal predict_ids
(
out_cache_loc,
out_cache_cont_start,
out_cache_cont_end,
) = model.token_to_kv_pool.alloc_contiguous(batch_size)
model.req_to_token_pool.req_to_token[req_pool_indices, seq_lens] = out_cache_loc
seq_lens.add_(1)
logits = model.forward_decode(
torch.from_numpy(predict_ids).cuda().reshape(-1),
req_pool_indices,
seq_lens,
None,
position_ids_offsets,
None,
out_cache_cont_start,
out_cache_cont_end,
)
prob_out = torch.softmax(logits, dim=-1)
predict_ids = torch.argmax(prob_out, dim=1, keepdim=True)
predict_ids = predict_ids.detach().cpu().numpy()
if print_logits and tp_rank == 0:
print("decode", i, logits)
# Warm up
(
req_pool_indices,
seq_lens,
prefix_lens,
position_ids_offsets,
out_cache_loc,
) = init_batch_data(model, batch_size, input_len)
predict_ids = None
prefill(True)
for i in range(output_len):
decode(True)
for i in range(batch_size):
req_idx = req_pool_indices[i].item()
model.token_to_kv_pool.free(
model.req_to_token_pool.req_to_token[req_idx, : seq_lens[i]]
)
model.req_to_token_pool.free(req_pool_indices)
# Benchmark
if tp_size > 1:
dist.barrier()
start_time = prefill_start_time = time.time()
(
req_pool_indices,
seq_lens,
prefix_lens,
position_ids_offsets,
out_cache_loc,
) = init_batch_data(model, batch_size, input_len)
prefill(False)
if tp_rank == 0:
print(f"prefill cost: {(time.time() - prefill_start_time) * 1000:.2f} ms")
for i in range(output_len):
step_start = time.time()
decode(False)
step_end = time.time()
if i % 100 == 0 or i == output_len - 1:
if tp_rank == 0:
print(f"step {i} cost: {(step_end - step_start) * 1000:.2f} ms")
end_time = time.time()
if tp_rank == 0:
print(f"total cost: {(end_time - start_time) * 1000:.2f}")
def test_generate(model_path, tp_size, batch_size, input_len, output_len):
workers = []
for tp_rank in range(tp_size):
proc = multiprocessing.Process(
target=test_generate_worker,
args=(
model_path,
tp_rank,
tp_size,
batch_size,
input_len,
output_len,
),
)
proc.start()
workers.append(proc)
for proc in workers:
proc.join()
if __name__ == "__main__":
test_generate("TinyLlama/TinyLlama-1.1B-Chat-v0.4", 1, 1, 256, 8)
# test_generate("meta-llama/Llama-2-7b-chat-hf", 1, 16, 256, 8)
# Reference output for TinyLlama-1.1B-Chat-v0.4 (1, 32, 8)
# prefill logits tensor([[-1.3380e-03, 4.4702e-01, 2.9082e+00, ..., -1.8398e+00,
# 1.8281e+00, 2.1816e+00]], device='cuda:0')
# decode 0 tensor([[-0.3904, 0.8784, 3.6934, ..., -2.4473, 1.5811, 2.0098]],
# device='cuda:0')
# decode 1 tensor([[-0.3552, 0.0635, 2.5781, ..., -2.5820, 1.3047, 1.7607]],
# device='cuda:0')
# decode 2 tensor([[-1.5645, -1.1963, 3.8145, ..., -2.9766, 1.0244, 1.0645]],
# device='cuda:0')
# decode 3 tensor([[-1.3682, -0.6548, 4.2734, ..., -2.8711, 1.1172, 1.1494]],
# device='cuda:0')
# decode 4 tensor([[-1.0205, -0.0060, 4.4844, ..., -2.7090, 1.6143, 1.8135]],
# device='cuda:0')
# decode 5 tensor([[ 0.4260, 1.6006, 4.3633, ..., -2.2480, 2.5547, 2.8379]],
# device='cuda:0')
# decode 6 tensor([[ 0.7095, 2.1816, 5.0078, ..., -2.1309, 3.0293, 3.0840]],
# device='cuda:0')
# decode 7 tensor([[-0.2883, 1.1289, 4.7188, ..., -2.4023, 2.1055, 2.1836]],
# device='cuda:0')
# Reference output for TinyLlama-1.1B-Chat-v0.4 (1, 256, 8)
# prefill logits tensor([[-2.5840, -2.7227, 6.8047, ..., -2.3613, 0.1224, 0.5952]],
# device='cuda:0')
# decode 0 tensor([[-0.6235, -0.7690, 9.2891, ..., -1.4922, 2.8008, 2.9531]],
# device='cuda:0')
# decode 1 tensor([[-1.3662, -1.4648, 7.1250, ..., -1.7861, 1.7363, 1.8857]],
# device='cuda:0')
# decode 2 tensor([[-0.8540, -0.5947, 9.1328, ..., -2.1211, 2.9707, 2.8945]],
# device='cuda:0')
# decode 3 tensor([[ 0.0652, 1.0312, 8.1250, ..., -2.0586, 3.4727, 3.6172]],
# device='cuda:0')
# decode 4 tensor([[-0.0459, 1.0098, 9.1406, ..., -2.1797, 3.8320, 3.9355]],
# device='cuda:0')
# decode 5 tensor([[ 0.2964, 1.3564, 9.8828, ..., -2.1602, 4.1836, 4.2422]],
# device='cuda:0')
# decode 6 tensor([[ 0.6475, 1.8105, 10.1250, ..., -2.0098, 4.2578, 4.4062]],
# device='cuda:0')
# decode 7 tensor([[ 0.4985, 1.4746, 9.9062, ..., -1.9141, 3.9863, 4.3047]],
# device='cuda:0')
import multiprocessing
import time
import numpy as np
import torch
import torch.distributed as dist
from sglang.srt.hf_transformers_utils import get_processor
from sglang.srt.managers.router.infer_batch import ForwardMode
from sglang.srt.managers.router.model_runner import InputMetadata, ModelRunner
from sglang.srt.model_config import ModelConfig
from sglang.srt.utils import load_image
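# Multimodal (LLaVA) smoke test: run one prefill with image features plus 16 decode
# steps through the low-level ModelRunner API and check the decoded caption.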
def init_batch_data(model, batch_size, input_len):
req_pool_indices = model.req_to_token_pool.alloc(batch_size)
seq_lens = torch.full((batch_size,), input_len, dtype=torch.int32, device="cuda")
prefix_lens = torch.zeros(batch_size, dtype=torch.int32, device="cuda")
position_ids_offsets = torch.zeros(batch_size, dtype=torch.int32, device="cuda")
out_cache_loc = model.token_to_kv_pool.alloc(batch_size * input_len)
for i in range(batch_size):
        # Index by the allocated request slot rather than the loop counter.
        req_idx = req_pool_indices[i].item()
        model.req_to_token_pool.req_to_token[req_idx, :input_len] = out_cache_loc[
            i * input_len : (i + 1) * input_len
        ]
return (
req_pool_indices,
seq_lens,
prefix_lens,
position_ids_offsets,
out_cache_loc,
)
def prefill(model, tp_rank, params, print_logits):
logits, _ = model.forward_extend_multi_modal(
*params,
False,
)
prob_out = torch.softmax(logits, dim=-1)
predict_ids = torch.argmax(prob_out, dim=1, keepdim=True)
predict_ids = predict_ids.detach().cpu().numpy()
if print_logits and tp_rank == 0:
print("prefill logits", logits, logits.shape)
return predict_ids
def decode(step, model, tp_rank, batch_size, predict_ids, params, print_logits):
(
req_pool_indices,
seq_lens,
prefix_lens,
position_ids_offsets,
out_cache_loc,
) = params
(
out_cache_loc,
out_cache_cont_start,
out_cache_cont_end,
) = model.token_to_kv_pool.alloc_contiguous(batch_size)
model.req_to_token_pool.req_to_token[req_pool_indices, seq_lens] = out_cache_loc
seq_lens.add_(1)
logits = model.forward_decode(
torch.from_numpy(predict_ids).cuda().reshape(-1),
req_pool_indices,
seq_lens,
None,
position_ids_offsets,
None,
out_cache_cont_start,
out_cache_cont_end,
)
prob_out = torch.softmax(logits, dim=-1)
predict_ids = torch.argmax(prob_out, dim=1, keepdim=True)
predict_ids = predict_ids.detach().cpu().numpy()
if print_logits and tp_rank == 0:
print("decode", step, logits)
return predict_ids
def test_generate_worker(
model_path,
tp_rank,
tp_size,
):
model_config = ModelConfig(path=model_path)
model = ModelRunner(model_config, 0.8, tp_rank, tp_size, 28888)
# print(model.model)
# Prepare data
prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: <image>\nDescribe this picture ASSISTANT:"
image_path = "/home/ubuntu/sglang/test/lang/image.png"
image = load_image(image_path)
processor = get_processor("llava-hf/llava-1.5-7b-hf")
input_ids = processor.tokenizer.encode(prompt)
pixel_values = processor.image_processor(image)["pixel_values"]
input_ids, offset = model.model.pad_input_ids(
input_ids,
[
0,
],
)
params = init_batch_data(model, 1, len(input_ids))
# inference
output_ids = []
prefill_params = (
torch.tensor(np.array(input_ids)).cuda(),
np.array(pixel_values),
[offset],
*params,
)
predict_ids = prefill(model, tp_rank=0, params=prefill_params, print_logits=False)
output_ids.append(predict_ids[0][0])
for i in range(16):
predict_ids = decode(
i,
model,
tp_rank=0,
batch_size=1,
predict_ids=predict_ids,
params=params,
print_logits=False,
)
output_ids.append(predict_ids[0][0])
# detokenization
output = processor.tokenizer.batch_decode(
[output_ids], skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]
assert (
output
== "The image features a man standing on the back of a yellow taxi cab, holding"
)
def test_generate(model_path, tp_size):
workers = []
for tp_rank in range(tp_size):
proc = multiprocessing.Process(
target=test_generate_worker,
args=(
model_path,
tp_rank,
tp_size,
),
)
proc.start()
workers.append(proc)
for proc in workers:
proc.join()
if __name__ == "__main__":
test_generate("liuhaotian/llava-v1.5-7b", 1)
import flashinfer
import pytest
import torch
from sglang.srt.layers.extend_attention import extend_attention_fwd
from sglang.srt.layers.token_attention import token_attention_fwd
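# These tests cross-check SGLang's triton attention kernels (extend_attention_fwd for
# prefill/extend, token_attention_fwd for decode) against flashinfer's paged-KV-cache
# kernels on randomly generated inputs.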
@pytest.mark.parametrize("batch_size", [12, 37, 67])
@pytest.mark.parametrize("kv_len", [54, 97])
@pytest.mark.parametrize("qo_len", [37, 17])
@pytest.mark.parametrize("num_kv_heads", [4])
@pytest.mark.parametrize("num_qo_heads", [4, 32])
@pytest.mark.parametrize("head_dim", [128])
@pytest.mark.parametrize("use_wrapper", [True, False])
def test_batch_prefill_with_paged_kv_cache(
batch_size,
kv_len,
qo_len,
num_kv_heads,
num_qo_heads,
head_dim,
use_wrapper,
):
q = torch.randn(batch_size * qo_len, num_qo_heads, head_dim).to(0).half()
q_indptr = torch.arange(0, batch_size + 1).to(0).int() * qo_len
total_tokens = kv_len * batch_size
kv_data = torch.randn(total_tokens, 2, num_kv_heads, 1, head_dim).to(0).half()
kv_indptr = torch.arange(0, batch_size + 1).to(0).int() * kv_len
kv_indices = torch.arange(0, total_tokens).to(0).int()
kv_last_page_len = torch.full((batch_size,), 1, dtype=torch.int32).to(0)
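    # Page size is 1 here: each KV "page" holds a single token, so kv_indices simply
    # enumerates token slots and every last-page length is 1.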
# init args for triton kernel
k_extend = (
kv_data.view(batch_size, kv_len, 2, -1)[:, -qo_len:, 0]
.contiguous()
.view(-1, num_kv_heads, head_dim)
)
v_extend = (
kv_data.view(batch_size, kv_len, 2, -1)[:, -qo_len:, 1]
.contiguous()
.view(-1, num_kv_heads, head_dim)
)
o_triton = torch.empty_like(q)
k_buffer = kv_data[:, 0].view(-1, num_kv_heads, head_dim).contiguous()
v_buffer = kv_data[:, 1].view(-1, num_kv_heads, head_dim).contiguous()
req_to_token = torch.arange(0, total_tokens).to(0).int().view(batch_size, kv_len)
b_req_idx = torch.arange(0, batch_size).to(0).int()
b_seq_len = torch.full((batch_size,), kv_len, dtype=torch.int32).to(0)
b_start_loc_extend = torch.arange(0, batch_size).to(0).int() * qo_len
b_seq_len_extend = torch.full((batch_size,), qo_len, dtype=torch.int32).to(0)
max_len_in_batch = kv_len
max_len_extend = qo_len
extend_attention_fwd(
q,
k_extend,
v_extend,
o_triton,
k_buffer,
v_buffer,
req_to_token,
b_req_idx,
None, # b_start_loc = None
b_seq_len,
None, # b_seq_len_prefix = None
b_start_loc_extend,
b_seq_len_extend,
max_len_in_batch,
max_len_extend,
)
if use_wrapper:
wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper()
wrapper.begin_forward(q_indptr, batch_size, num_qo_heads, num_kv_heads)
o = wrapper.forward(
q, q_indptr, kv_data, kv_indptr, kv_indices, kv_last_page_len
)
else:
o = flashinfer.batch_prefill_with_paged_kv_cache(
q,
q_indptr,
kv_data,
kv_indptr,
kv_indices,
kv_last_page_len,
)
print("Mean: ", torch.mean(torch.abs(o - o_triton)))
print("Max: ", torch.max(torch.abs(o - o_triton)))
assert torch.allclose(o, o_triton, rtol=1e-2, atol=1e-3)
@pytest.mark.parametrize("batch_size", [12, 17, 37])
@pytest.mark.parametrize("kv_len", [54, 127, 537])
@pytest.mark.parametrize("num_kv_heads", [32])
@pytest.mark.parametrize("num_qo_heads", [32])
@pytest.mark.parametrize("head_dim", [128])
def test_batch_decode_with_paged_kv_cache(
batch_size,
kv_len,
num_kv_heads,
num_qo_heads,
head_dim,
):
    # note(lsyin): under pytest, the number of heads cannot change across
    # parametrized runs because the triton kernel has a cache.
    # To test a different decode shape, change the parameters in __main__ and run
    # the decode test only once.
q = torch.randn(batch_size, num_qo_heads, head_dim).to(0).half()
total_tokens = kv_len * batch_size
kv_data = torch.randn(total_tokens, 2, num_kv_heads, 1, head_dim).to(0).half()
kv_indptr = torch.arange(0, batch_size + 1).to(0).int() * kv_len
kv_indices = torch.arange(0, total_tokens).to(0).int()
kv_last_page_len = torch.full((batch_size,), 1, dtype=torch.int32).to(0)
# init args for triton kernel
k_buffer = kv_data[:, 0].view(-1, num_kv_heads, head_dim).contiguous()
v_buffer = kv_data[:, 1].view(-1, num_kv_heads, head_dim).contiguous()
o_triton = torch.empty_like(q)
req_to_token = (
torch.arange(0, kv_len * batch_size).to(0).int().view(batch_size, kv_len)
)
b_req_idx = torch.arange(0, batch_size).to(0).int()
b_start_loc = torch.arange(0, batch_size).to(0).int() * kv_len
b_seq_len = torch.full((batch_size,), kv_len, dtype=torch.int32).to(0)
max_len_in_batch = kv_len
other_kv_index = 0
token_attention_fwd(
q,
k_buffer,
v_buffer,
o_triton,
req_to_token,
b_req_idx,
b_start_loc,
b_seq_len,
max_len_in_batch,
other_kv_index,
total_tokens,
)
wrapper = flashinfer.BatchDecodeWithPagedKVCacheWrapper()
wrapper.begin_forward(
kv_indptr,
kv_last_page_len,
batch_size,
num_qo_heads,
num_kv_heads,
head_dim,
1,
"NONE",
"float16",
)
o = wrapper.forward(q, kv_data, kv_indptr, kv_indices, kv_last_page_len)
print("Mean: ", torch.mean(torch.abs(o - o_triton)))
print("Max: ", torch.max(torch.abs(o - o_triton)))
assert torch.allclose(o, o_triton, rtol=1e-2, atol=2e-3)
if __name__ == "__main__":
test_batch_prefill_with_paged_kv_cache(12, 54, 37, 8, 8, 128, False)
test_batch_prefill_with_paged_kv_cache(37, 1111, 456, 32, 32, 128, True)
test_batch_decode_with_paged_kv_cache(12, 54, 4, 32, 128)
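    # The calls above exercise a single shape each; the full parameter sweep runs
    # under pytest (e.g. `pytest -q` on this file), subject to the head-count caveat
    # noted in test_batch_decode_with_paged_kv_cache.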