Commit 22085081 authored by Lianmin Zheng
parent f6d40df0
"""Sampling parameters for text generation."""
from typing import List, Optional, Union
_SAMPLING_EPS = 1e-6
class SamplingParams:
def __init__(
self,
temperature: float = 1.0,
top_p: float = 1.0,
top_k: int = -1,
frequency_penalty: float = 0.0,
presence_penalty: float = 0.0,
stop: Optional[Union[str, List[str]]] = None,
max_new_tokens: int = 16,
ignore_eos: bool = False,
skip_special_tokens: bool = True,
dtype: Optional[str] = None,
regex: Optional[str] = None,
) -> None:
self.temperature = temperature
self.top_p = top_p
self.top_k = top_k
self.frequency_penalty = frequency_penalty
self.presence_penalty = presence_penalty
self.stop_strs = stop
self.max_new_tokens = max_new_tokens
self.ignore_eos = ignore_eos
self.skip_special_tokens = skip_special_tokens
self.dtype = dtype
self.regex = regex
        # Process some special cases
        if self.temperature < _SAMPLING_EPS:
            # A (near-)zero temperature means greedy sampling: keep only the
            # top-1 token and use a safe temperature to avoid division by zero.
            self.temperature = 1.0
            self.top_k = 1
        if self.top_k == -1:
            self.top_k = 1 << 30  # whole vocabulary
        if self.dtype == "int":
            # Integer decoding stops at the first whitespace or newline.
            self.stop_strs = [" ", "\n"]
def verify(self):
if self.temperature < 0.0:
raise ValueError(
f"temperature must be non-negative, got {self.temperature}."
)
if not 0.0 < self.top_p <= 1.0:
raise ValueError(f"top_p must be in (0, 1], got {self.top_p}.")
if self.top_k < -1 or self.top_k == 0:
raise ValueError(
f"top_k must be -1 (disable), or at least 1, " f"got {self.top_k}."
)
if not -2.0 <= self.frequency_penalty <= 2.0:
raise ValueError(
"frequency_penalty must be in [-2, 2], got "
f"{self.frequency_penalty}."
)
if not -2.0 <= self.presence_penalty <= 2.0:
raise ValueError(
"presence_penalty must be in [-2, 2], got " f"{self.presence_penalty}."
)
if self.max_new_tokens < 0:
raise ValueError(
f"max_new_tokens must be at least 0, got {self.max_new_tokens}."
)
def normalize(self, tokenizer):
# Process stop strings
if self.stop_strs is None:
self.stop_strs = []
self.stop_str_max_len = 0
else:
if isinstance(self.stop_strs, str):
self.stop_strs = [self.stop_strs]
stop_str_max_len = 0
for stop_str in self.stop_strs:
stop_str_ids = tokenizer.encode(stop_str, add_special_tokens=False)
stop_str_max_len = max(stop_str_max_len, len(stop_str_ids))
self.stop_str_max_len = stop_str_max_len
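# Illustrative usage sketch (not part of the original file). `tokenizer` is
# assumed to be any Hugging Face-style tokenizer exposing `encode`:
#
#   params = SamplingParams(temperature=0.7, top_p=0.9, stop=["###"], max_new_tokens=64)
#   params.verify()              # raises ValueError if any field is out of range
#   params.normalize(tokenizer)  # wraps `stop` into a list and records stop_str_max_len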
"""SRT: SGLang Runtime"""
import argparse
import asyncio
import dataclasses
import json
import multiprocessing as mp
import sys
import threading
import time
from typing import List, Optional
# Work around a Python shutdown issue: concurrent.futures registers an atexit
# hook through threading._register_atexit that can block process exit, so it
# is replaced with a no-op here.
setattr(threading, "_register_atexit", lambda *args, **kwargs: None)
import psutil
import requests
import uvicorn
import uvloop
from fastapi import FastAPI
from fastapi.responses import StreamingResponse
from sglang.backend.runtime_endpoint import RuntimeEndpoint
from sglang.srt.managers.detokenizer_manager import start_detokenizer_process
from sglang.srt.managers.io_struct import GenerateReqInput
from sglang.srt.managers.openai_protocol import CompletionRequest
from sglang.srt.managers.router.manager import start_router_process
from sglang.srt.managers.tokenizer_manager import TokenizerManager
from sglang.srt.server_args import PortArgs, ServerArgs
from sglang.srt.utils import alloc_usable_network_port
asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
app = FastAPI()
tokenizer_manager = None
@app.get("/get_model_info")
async def get_model_info():
result = {
"model_path": tokenizer_manager.model_path,
}
return result
@app.post("/generate")
async def generate_request(obj: GenerateReqInput):
obj.post_init()
result_generator = tokenizer_manager.generate_request(obj)
if obj.stream:
async def stream_results():
async for out in result_generator:
yield (json.dumps(out) + "\0").encode("utf-8")
return StreamingResponse(stream_results(), media_type="text/event-stream")
else:
ret = await result_generator.__anext__()
return ret
@app.post("/v1/completions")
async def v1_completions(obj: CompletionRequest):
assert obj.n == 1
obj = GenerateReqInput(
text=obj.prompt,
sampling_params={
"temperature": obj.temperature,
"max_new_tokens": obj.max_tokens,
"stop": obj.stop,
},
)
ret = await generate_request(obj)
return {
"choices": [{"text": ret["text"]}],
}
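# Illustrative request against the endpoints above (assumes a server is
# already running at http://127.0.0.1:30000):
#
#   import requests
#   res = requests.post(
#       "http://127.0.0.1:30000/generate",
#       json={
#           "text": "The capital of France is",
#           "sampling_params": {"temperature": 0, "max_new_tokens": 16},
#       },
#   )
#   print(res.json()["text"])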
def launch_server(server_args, pipe_finish_writer):
global tokenizer_manager
# Allocate ports
can_use_ports = alloc_usable_network_port(
num=4 + server_args.tp_size, used_list=(server_args.port,)
)
port_args = PortArgs(
tokenizer_port=can_use_ports[0],
router_port=can_use_ports[1],
detokenizer_port=can_use_ports[2],
nccl_port=can_use_ports[3],
model_rpc_ports=can_use_ports[4:],
)
# Launch processes
tokenizer_manager = TokenizerManager(server_args, port_args)
pipe_router_reader, pipe_router_writer = mp.Pipe(duplex=False)
pipe_detoken_reader, pipe_detoken_writer = mp.Pipe(duplex=False)
proc_router = mp.Process(
target=start_router_process,
args=(
server_args,
port_args,
pipe_router_writer,
),
)
proc_router.start()
proc_detoken = mp.Process(
target=start_detokenizer_process,
args=(
server_args,
port_args,
pipe_detoken_writer,
),
)
proc_detoken.start()
# Wait for the model to finish loading
router_init_state = pipe_router_reader.recv()
detoken_init_state = pipe_detoken_reader.recv()
if router_init_state != "init ok" or detoken_init_state != "init ok":
proc_router.kill()
proc_detoken.kill()
print("router init state:", router_init_state)
print("detoken init state:", detoken_init_state)
sys.exit(1)
assert proc_router.is_alive() and proc_detoken.is_alive()
    def launch_api_server():
# Launch api server
uvicorn.run(
app,
host=server_args.host,
port=server_args.port,
log_level=server_args.log_level,
timeout_keep_alive=5,
loop="uvloop",
)
    t = threading.Thread(target=launch_api_server)
t.start()
    if pipe_finish_writer:
        url = server_args.url()
        success = False
        last_error = None
        for _ in range(60):
            try:
                requests.get(url + "/get_model_info", timeout=5)
                success = True
                break
            except requests.exceptions.RequestException as e:
                # Keep a reference: `e` is cleared when the except block exits,
                # so it cannot be read after the loop.
                last_error = e
                time.sleep(1)
        if success:
            pipe_finish_writer.send("init ok")
        else:
            pipe_finish_writer.send(str(last_error))
class Runtime:
def __init__(
self,
model_path: str,
tokenizer_path: Optional[str] = None,
load_format: str = "auto",
tokenizer_mode: str = "auto",
trust_remote_code: bool = True,
mem_fraction_static: float = 0.9,
tp_size: int = 1,
model_mode: List[str] = (),
schedule_heuristic: str = "lpm",
random_seed: int = 42,
log_level: str = "warning",
):
host = "127.0.0.1"
port = alloc_usable_network_port(1)[0]
server_args = ServerArgs(
model_path=model_path,
tokenizer_path=tokenizer_path,
host=host,
port=port,
load_format=load_format,
tokenizer_mode=tokenizer_mode,
trust_remote_code=trust_remote_code,
mem_fraction_static=mem_fraction_static,
tp_size=tp_size,
model_mode=model_mode,
schedule_heuristic=schedule_heuristic,
random_seed=random_seed,
log_level=log_level,
)
self.url = server_args.url()
self.pid = None
pipe_reader, pipe_writer = mp.Pipe(duplex=False)
proc = mp.Process(target=launch_server, args=(server_args, pipe_writer))
proc.start()
self.pid = proc.pid
init_state = pipe_reader.recv()
if init_state != "init ok":
self.shutdown()
raise RuntimeError("Launch failed")
self.endpoint = RuntimeEndpoint(self.url)
def shutdown(self):
if self.pid is not None:
parent = psutil.Process(self.pid)
children = parent.children(recursive=True)
for child in children:
child.kill()
psutil.wait_procs(children, timeout=5)
parent.kill()
parent.wait(timeout=5)
self.pid = None
def __del__(self):
self.shutdown()
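# Illustrative sketch of the intended lifecycle (the model path is only an example):
#
#   runtime = Runtime(model_path="meta-llama/Llama-2-7b-chat-hf", tp_size=1)
#   print(runtime.url)      # HTTP endpoint of the launched server
#   # ... use runtime.endpoint as an sglang backend ...
#   runtime.shutdown()      # kill the server process tree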
import argparse
import dataclasses
from typing import List, Optional
@dataclasses.dataclass
class ServerArgs:
model_path: str
tokenizer_path: Optional[str] = None
host: str = "127.0.0.1"
port: int = 30000
load_format: str = "auto"
tokenizer_mode: str = "auto"
trust_remote_code: bool = True
mem_fraction_static: float = 0.91
tp_size: int = 1
model_mode: List[str] = ()
schedule_heuristic: str = "lpm"
random_seed: int = 42
disable_log_stats: bool = False
log_stats_interval: int = 10
log_level: str = "info"
def __post_init__(self):
if self.tokenizer_path is None:
self.tokenizer_path = self.model_path
@staticmethod
def add_cli_args(parser: argparse.ArgumentParser):
parser.add_argument(
"--model-path",
type=str,
help="The path of the model weights. This can be a local folder or a Hugging Face repo ID.",
required=True,
)
parser.add_argument(
"--tokenizer-path",
type=str,
default=ServerArgs.tokenizer_path,
help="The path of the tokenizer.",
)
parser.add_argument("--host", type=str, default=ServerArgs.host)
parser.add_argument("--port", type=int, default=ServerArgs.port)
parser.add_argument(
"--load-format",
type=str,
default=ServerArgs.load_format,
choices=["auto", "pt", "safetensors", "npcache", "dummy"],
help="The format of the model weights to load. "
'"auto" will try to load the weights in the safetensors format '
"and fall back to the pytorch bin format if safetensors format "
"is not available. "
'"pt" will load the weights in the pytorch bin format. '
'"safetensors" will load the weights in the safetensors format. '
'"npcache" will load the weights in pytorch format and store '
"a numpy cache to speed up the loading. "
'"dummy" will initialize the weights with random values, '
"which is mainly for profiling.",
)
parser.add_argument(
"--tokenizer-mode",
type=str,
default=ServerArgs.tokenizer_mode,
choices=["auto", "slow"],
help="Tokenizer mode. 'auto' will use the fast "
"tokenizer if available, and 'slow' will "
"always use the slow tokenizer.",
)
parser.add_argument(
"--trust-remote-code",
action="store_true",
help="Whether or not to allow for custom models defined on the Hub in their own modeling files.",
)
parser.add_argument(
"--mem-fraction-static",
type=float,
default=ServerArgs.mem_fraction_static,
help="The fraction of the memory used for static allocation (model weights and KV cache memory pool)",
)
parser.add_argument(
"--tp-size",
type=int,
default=ServerArgs.tp_size,
help="Tensor parallelism degree.",
)
parser.add_argument(
"--model-mode",
type=str,
default=[],
nargs="+",
help="Model mode: [flashinfer, no-cache, aggressive-new-fill]",
)
parser.add_argument(
"--schedule-heuristic",
type=str,
default=ServerArgs.schedule_heuristic,
help="Schudule mode: [lpm, weight, random, fcfs]",
)
parser.add_argument(
"--random-seed",
type=int,
default=ServerArgs.random_seed,
help="Random seed.",
)
parser.add_argument(
"--log-level",
type=str,
default=ServerArgs.log_level,
help="Log level",
)
parser.add_argument(
"--disable-log-stats",
action="store_true",
help="Disable logging throughput stats.",
)
parser.add_argument(
"--log-stats-interval",
type=int,
default=ServerArgs.log_stats_interval,
help="Log stats interval in second.",
)
@classmethod
def from_cli_args(cls, args: argparse.Namespace):
attrs = [attr.name for attr in dataclasses.fields(cls)]
return cls(**{attr: getattr(args, attr) for attr in attrs})
def url(self):
return f"http://{self.host}:{self.port}"
@dataclasses.dataclass
class PortArgs:
tokenizer_port: int
router_port: int
detokenizer_port: int
nccl_port: int
model_rpc_ports: List[int]
import base64
import os
import random
import socket
import sys
import time
import traceback
from io import BytesIO
import numpy as np
import requests
import torch
import torch.distributed as dist
is_show_cost_time = False
def mark_cost_time(func_name):
def inner_func(func):
def time_func(*args, **kwargs):
if dist.get_rank() in [0, 1] and is_show_cost_time:
torch.cuda.synchronize()
start_time = time.time()
ans = func(*args, **kwargs)
torch.cuda.synchronize()
print(func_name, "cost time:", (time.time() - start_time) * 1000)
return ans
else:
torch.cuda.synchronize()
ans = func(*args, **kwargs)
torch.cuda.synchronize()
return ans
return time_func
return inner_func
time_mark = {}
def mark_start(key):
torch.cuda.synchronize()
global time_mark
time_mark[key] = time.time()
return
def mark_end(key, print_min_cost=0.0):
torch.cuda.synchronize()
global time_mark
cost_time = (time.time() - time_mark[key]) * 1000
if cost_time > print_min_cost:
print(f"cost {key}:", cost_time)
def calculate_time(show=False, min_cost_ms=0.0):
def wrapper(func):
def inner_func(*args, **kwargs):
torch.cuda.synchronize()
if show:
start_time = time.time()
result = func(*args, **kwargs)
torch.cuda.synchronize()
if show:
cost_time = (time.time() - start_time) * 1000
if cost_time > min_cost_ms:
print(f"Function {func.__name__} took {cost_time} ms to run.")
return result
return inner_func
return wrapper
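# Illustrative sketch: timing a function with the decorator above on a
# CUDA-enabled setup (the decorated function is hypothetical).
#
#   @calculate_time(show=True, min_cost_ms=1.0)
#   def heavy_step(x):
#       return x * x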
def set_random_seed(seed: int) -> None:
random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed_all(seed)
def alloc_usable_network_port(num, used_list=()):
port_list = []
for port in range(10000, 65536):
if port in used_list:
continue
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
try:
s.bind(("", port))
port_list.append(port)
except socket.error:
pass
if len(port_list) == num:
return port_list
return None
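# Illustrative sketch: probe for two free ports, skipping one known to be
# taken. The ports are only test-bound and released, and None is returned if
# not enough ports can be found.
#
#   ports = alloc_usable_network_port(num=2, used_list=(30000,))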
def get_exception_traceback():
etype, value, tb = sys.exc_info()
err_str = "".join(traceback.format_exception(etype, value, tb))
return err_str
def get_int_token_logit_bias(tokenizer, vocab_size):
from transformers import LlamaTokenizer, LlamaTokenizerFast
logit_bias = np.zeros(vocab_size, dtype=np.float32)
for t_id in range(vocab_size):
ss = tokenizer.decode(t_id).strip()
if not (ss.isdigit() or len(ss) == 0 or t_id == tokenizer.eos_token_id):
logit_bias[t_id] = -1e5
# else:
# print(ss, t_id)
return logit_bias
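# Illustrative sketch of how such a bias is typically applied; adding it to
# the logits here is an assumption for illustration, not code from this file.
#
#   bias = get_int_token_logit_bias(tokenizer, vocab_size)
#   logits = logits + torch.from_numpy(bias).to(logits.device)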
def wrap_kernel_launcher(kernel):
"""A faster launcher for triton kernels."""
import torch.distributed as dist
if dist.is_initialized():
rank = dist.get_rank()
else:
rank = 0
kernels = kernel.cache[rank].values()
kernel = next(iter(kernels))
    # Different Triton versions use different low-level names
if hasattr(kernel, "cu_function"):
kfunction = kernel.cu_function
else:
kfunction = kernel.function
if hasattr(kernel, "c_wrapper"):
run = kernel.c_wrapper
else:
run = kernel.run
    # Newer Triton launch signatures take extra cluster dimensions; on a
    # TypeError the flag is flipped and the launch is retried with the other
    # signature.
    add_cluster_dim = True
def ret_func(grid, num_warps, *args):
nonlocal add_cluster_dim
try:
if add_cluster_dim:
run(
grid[0],
grid[1],
grid[2],
num_warps,
1,
1,
1,
1,
kernel.shared,
0,
kfunction,
None,
None,
kernel,
*args,
)
else:
run(
grid[0],
grid[1],
grid[2],
num_warps,
kernel.shared,
0,
kfunction,
None,
None,
kernel,
*args,
)
except TypeError:
add_cluster_dim = not add_cluster_dim
ret_func(grid, num_warps, *args)
return ret_func
def is_multimodal_model(model):
if isinstance(model, str):
return "llava" in model
from sglang.srt.model_config import ModelConfig
if isinstance(model, ModelConfig):
return "llava" in model.path.lower()
raise Exception("unrecognized type")
def load_image(image_file):
from PIL import Image
image = None
if image_file.startswith("http://") or image_file.startswith("https://"):
timeout = int(os.getenv("REQUEST_TIMEOUT", "3"))
response = requests.get(image_file, timeout=timeout)
image = Image.open(BytesIO(response.content))
elif image_file.lower().endswith(("png", "jpg", "jpeg", "webp", "gif")):
image = Image.open(image_file)
elif image_file.startswith("data:"):
        image_file = image_file.split(",")[1]
image = Image.open(BytesIO(base64.b64decode(image_file)))
else:
image = Image.open(BytesIO(base64.b64decode(image_file)))
return image
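# Illustrative sketch: the helper accepts an http(s) URL, a local image path,
# a data URI, or a raw base64-encoded string (the file names are examples only).
#
#   img = load_image("https://example.com/cat.png")
#   img = load_image("local_photo.jpg")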
"""
This file contains the SGL programs used for unit testing.
"""
import json
import re
import sglang as sgl
def test_few_shot_qa():
@sgl.function
def few_shot_qa(s, question):
s += "The following are questions with answers.\n\n"
s += "Q: What is the capital of France?\n"
s += "A: Paris\n"
s += "Q: What is the capital of Germany?\n"
s += "A: Berlin\n"
s += "Q: What is the capital of Italy?\n"
s += "A: Rome\n"
s += "Q: " + question + "\n"
s += "A:" + sgl.gen("answer", stop="\n", temperature=0)
ret = few_shot_qa.run(question="What is the capital of the United States?")
assert "washington" in ret["answer"].strip().lower(), f"answer: {ret['answer']}"
rets = few_shot_qa.run_batch(
[
{"question": "What is the capital of Japan?"},
{"question": "What is the capital of the United Kingdom?"},
{"question": "What is the capital city of China?"},
],
temperature=0.1,
)
answers = [x["answer"].strip().lower() for x in rets]
assert answers == ["tokyo", "london", "beijing"], f"answers: {answers}"
def test_mt_bench():
@sgl.function
def answer_mt_bench(s, question_1, question_2):
s += sgl.system("You are a helpful assistant.")
s += sgl.user(question_1)
s += sgl.assistant(sgl.gen("answer_1"))
with s.user():
s += question_2
with s.assistant():
s += sgl.gen("answer_2")
question_1 = "Compose an engaging travel blog post about a recent trip to Hawaii, highlighting cultural experiences and must-see attractions."
question_2 = (
"Rewrite your previous response. Start every sentence with the letter A."
)
ret = answer_mt_bench.run(
question_1=question_1, question_2=question_2, temperature=0.7, max_new_tokens=64
)
assert len(ret.messages()) in [4, 5]
def test_select(check_answer):
@sgl.function
def true_or_false(s, statement):
s += "Determine whether the statement below is True, False, or Unknown.\n"
s += "Statement: The capital of France is Pairs.\n"
s += "Answer: True\n"
s += "Statement: " + statement + "\n"
s += "Answer:" + sgl.select("answer", ["True", "False", "Unknown"])
ret = true_or_false.run(
statement="The capital of Germany is Berlin.",
)
if check_answer:
assert ret["answer"] == "True", ret.text
else:
assert ret["answer"] in ["True", "False", "Unknown"]
ret = true_or_false.run(
statement="The capital of Canada is Tokyo.",
)
if check_answer:
assert ret["answer"] == "False", ret.text
else:
assert ret["answer"] in ["True", "False", "Unknown"]
ret = true_or_false.run(
statement="Purple is a better color than green.",
)
if check_answer:
assert ret["answer"] == "Unknown", ret.text
else:
assert ret["answer"] in ["True", "False", "Unknown"]
def test_decode_int():
@sgl.function
def decode_int(s):
s += "The number of hours in a day is " + sgl.gen_int("hours") + "\n"
s += "The number of days in a year is " + sgl.gen_int("days") + "\n"
ret = decode_int.run(temperature=0.1)
assert int(ret["hours"]) == 24, ret.text
assert int(ret["days"]) == 365, ret.text
def test_decode_json():
@sgl.function
def decode_json(s):
s += "Generate a JSON object to describe the basic information of a city.\n"
with s.var_scope("json_output"):
s += "{\n"
s += ' "name": ' + sgl.gen_string() + ",\n"
s += ' "population": ' + sgl.gen_int() + ",\n"
s += ' "area": ' + sgl.gen(dtype=int) + ",\n"
s += ' "country": ' + sgl.gen_string() + ",\n"
s += ' "timezone": ' + sgl.gen(dtype=str) + "\n"
s += "}"
ret = decode_json.run()
js_obj = json.loads(ret["json_output"])
assert isinstance(js_obj["name"], str)
assert isinstance(js_obj["population"], int)
def test_expert_answer():
@sgl.function
def expert_answer(s, question):
s += "Question: " + question + "\n"
s += (
"A good person to answer this question is"
+ sgl.gen("expert", stop=[".", "\n"])
+ ".\n"
)
s += (
"For example,"
+ s["expert"]
+ " would answer that "
+ sgl.gen("answer", stop=".")
+ "."
)
ret = expert_answer.run(question="What is the capital of France?", temperature=0.1)
assert "paris" in ret.text().lower()
def test_tool_use():
def calculate(expression):
return f"{eval(expression)}"
@sgl.function
def tool_use(s, lhs, rhs):
s += "Please perform computations using a calculator. You can use calculate(expression) to get the results.\n"
s += "For example,\ncalculate(1+2)=3\ncalculate(3*4)=12\n"
s += "Question: What is the product of " + lhs + " and " + rhs + "?\n"
s += (
"Answer: The answer is calculate("
+ sgl.gen("expression", stop=")")
+ ") = "
)
with s.var_scope("answer"):
s += calculate(s["expression"])
lhs, rhs = 257, 983
ret = tool_use(lhs=lhs, rhs=rhs, temperature=0)
assert int(ret["answer"]) == lhs * rhs
def test_react():
@sgl.function
def react(s, question):
s += """
Question: Which country does the founder of Microsoft live in?
Thought 1: I need to search for the founder of Microsoft.
Action 1: Search [Founder of Microsoft].
Observation 1: The founder of Microsoft is Bill Gates.
Thought 2: I need to search for the country where Bill Gates lives in.
Action 2: Search [Where does Bill Gates live].
Observation 2: Bill Gates lives in the United States.
Thought 3: The answer is the United States.
Action 3: Finish [United States].\n
"""
s += "Question: " + question + "\n"
for i in range(1, 5):
s += f"Thought {i}:" + sgl.gen(stop=[".", "\n"]) + ".\n"
s += f"Action {i}: " + sgl.select(f"action_{i}", ["Search", "Finish"])
if s[f"action_{i}"] == "Search":
s += " [" + sgl.gen(stop="]") + "].\n"
s += f"Observation {i}:" + sgl.gen(stop=[".", "\n"]) + ".\n"
else:
s += " [" + sgl.gen("answer", stop="]") + "].\n"
break
ret = react.run(
question="What country does the creator of Linux live in?",
temperature=0.1,
)
answer = ret["answer"].lower()
assert "finland" in answer or "states" in answer
def test_parallel_decoding():
max_tokens = 64
number = 5
@sgl.function
def parallel_decoding(s, topic):
s += "Act as a helpful assistant.\n"
s += "USER: Give some tips for " + topic + ".\n"
s += (
"ASSISTANT: Okay. Here are "
+ str(number)
+ " concise tips, each under 8 words:\n"
)
# Generate skeleton
for i in range(1, 1 + number):
s += f"{i}." + sgl.gen(max_tokens=16, stop=[".", "\n"]) + ".\n"
# Generate detailed tips
forks = s.fork(number)
for i in range(number):
forks[
i
] += f"Now, I expand tip {i+1} into a detailed paragraph:\nTip {i+1}:"
forks[i] += sgl.gen("detailed_tip", max_tokens, stop=["\n\n"])
forks.join()
# Concatenate tips and summarize
s += "Here are these tips with detailed explanation:\n"
for i in range(number):
s += f"Tip {i+1}:" + forks[i]["detailed_tip"] + "\n"
s += "\nIn summary," + sgl.gen("summary", max_tokens=512)
ret = parallel_decoding.run(topic="writing a good blog post", temperature=0.3)
def test_parallel_encoding(check_answer=True):
max_tokens = 64
@sgl.function
def parallel_encoding(s, question, context_0, context_1, context_2):
s += "USER: I will ask a question based on some statements.\n"
s += "ASSISTANT: Sure. I will give the answer.\n"
s += "USER: Please memorize these statements.\n"
contexts = [context_0, context_1, context_2]
forks = s.fork(len(contexts))
forks += lambda i: f"Statement {i}: " + contexts[i] + "\n"
forks.join(mode="concate_and_append")
s += "Now, please answer the following question. " "Do not list options."
s += "\nQuestion: " + question + "\n"
s += "ASSISTANT:" + sgl.gen("answer", max_tokens=max_tokens)
ret = parallel_encoding.run(
question="Who is the father of Julian?",
context_0="Ethan is the father of Liam.",
context_1="Noah is the father of Julian.",
context_2="Oliver is the father of Carlos.",
temperature=0,
)
answer = ret["answer"]
if check_answer:
assert "Noah" in answer
def test_image_qa():
@sgl.function
def image_qa(s, question):
s += sgl.user(sgl.image("image.png") + question)
s += sgl.assistant(sgl.gen("answer"))
state = image_qa.run(
question="Please describe this image in simple words.",
temperature=0,
max_new_tokens=64,
)
assert "taxi" in state.messages()[-1]["content"]
def test_stream():
@sgl.function
def qa(s, question):
s += sgl.user(question)
s += sgl.assistant(sgl.gen("answer"))
ret = qa(
question="Compose an engaging travel blog post about a recent trip to Hawaii, highlighting cultural experiences and must-see attractions.",
stream=True,
)
out = ""
for chunk in ret.text_iter():
out += chunk
ret = qa(
question="Compose an engaging travel blog post about a recent trip to Hawaii, highlighting cultural experiences and must-see attractions.",
stream=True,
)
out = ""
for chunk in ret.text_iter("answer"):
out += chunk
def test_regex():
regex = r"((25[0-5]|2[0-4]\d|[01]?\d\d?).){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)"
@sgl.function
def regex_gen(s):
s += "Q: What is the IP address of the Google DNS servers?\n"
s += "A: " + sgl.gen(
"answer",
temperature=0,
regex=regex,
)
state = regex_gen.run()
answer = state["answer"]
assert re.match(regex, answer)
"""Common utilities for testing and benchmarking"""
import numpy as np
import requests
from sglang.backend.openai import OpenAI
from sglang.backend.runtime_endpoint import RuntimeEndpoint
from sglang.global_config import global_config
def call_generate_lightllm(prompt, temperature, max_tokens, stop, url):
data = {
"inputs": prompt,
"parameters": {
"temperature": temperature,
"max_new_tokens": max_tokens,
"stop_sequences": stop,
},
}
res = requests.post(url, json=data)
assert res.status_code == 200
pred = res.json()["generated_text"][0]
return pred
def call_generate_vllm(prompt, temperature, max_tokens, stop, url, n=1):
data = {
"prompt": prompt,
"temperature": temperature,
"max_tokens": max_tokens,
"stop": stop,
"n": n,
}
res = requests.post(url, json=data)
assert res.status_code == 200
if n == 1:
pred = res.json()["text"][0][len(prompt) :]
else:
pred = [x[len(prompt) :] for x in res.json()["text"]]
return pred
def call_generate_srt_raw(prompt, temperature, max_tokens, stop, url):
data = {
"text": prompt,
"sampling_params": {
"temperature": temperature,
"max_new_tokens": max_tokens,
"stop": stop,
},
}
res = requests.post(url, json=data)
assert res.status_code == 200
obj = res.json()
pred = obj["text"]
return pred
def call_select_lightllm(context, choices, url):
scores = []
for i in range(len(choices)):
data = {
"inputs": context + choices[i],
"parameters": {
"max_new_tokens": 1,
},
}
res = requests.post(url, json=data)
assert res.status_code == 200
        # The LightLLM endpoint does not return a usable score here, so a
        # placeholder of 0 is recorded for every choice.
        scores.append(0)
return np.argmax(scores)
def call_select_vllm(context, choices, url):
scores = []
for i in range(len(choices)):
data = {
"prompt": context + choices[i],
"max_tokens": 1,
"prompt_logprobs": 1,
}
res = requests.post(url, json=data)
assert res.status_code == 200
scores.append(res.json()["prompt_score"])
return np.argmax(scores)
"""
Modify vllm/entrypoints/api_server.py
if final_output.prompt_logprobs is not None:
score = np.mean([prob[t_id] for t_id, prob in zip(final_output.prompt_token_ids[1:], final_output.prompt_logprobs[1:])])
ret["prompt_score"] = score
"""
def add_common_other_args_and_parse(parser):
parser.add_argument("--parallel", type=int, default=96)
parser.add_argument("--host", type=str, default="http://127.0.0.1")
parser.add_argument("--port", type=int, default=None)
parser.add_argument(
"--backend",
type=str,
required=True,
choices=["vllm", "lightllm", "guidance", "lmql", "srt-raw", "llama.cpp"],
)
parser.add_argument(
"--model-path", type=str, default="meta-llama/Llama-2-7b-chat-hf"
)
parser.add_argument("--result-file", type=str, default="result.jsonl")
args = parser.parse_args()
if args.port is None:
default_port = {
"vllm": 21000,
"lightllm": 22000,
"lmql": 23000,
"srt-raw": 30000,
}
args.port = default_port.get(args.backend, None)
return args
def add_common_sglang_args_and_parse(parser):
parser.add_argument("--parallel", type=int, default=64)
parser.add_argument("--host", type=str, default="http://127.0.0.1")
parser.add_argument("--port", type=int, default=30000)
parser.add_argument("--backend", type=str, default="srt")
parser.add_argument("--result-file", type=str, default="result.jsonl")
args = parser.parse_args()
return args
def select_sglang_backend(args):
if args.backend.startswith("srt"):
if args.backend == "srt-no-parallel":
global_config.enable_parallel_decoding = False
global_config.enable_parallel_encoding = False
backend = RuntimeEndpoint(f"{args.host}:{args.port}")
elif args.backend.startswith("gpt"):
backend = OpenAI(args.backend)
else:
raise ValueError(f"Invalid backend: {args.backend}")
return backend
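# Illustrative sketch: wiring the argument helpers and backend selection
# together in a benchmark script (assumes `argparse` is imported).
#
#   parser = argparse.ArgumentParser()
#   args = add_common_sglang_args_and_parse(parser)
#   backend = select_sglang_backend(args)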
"""Common utilities."""
import base64
import json
import threading
import urllib.request
from io import BytesIO
from json import dumps
import requests
def get_available_gpu_memory(gpu_id, distributed=True):
"""
Get available memory for cuda:gpu_id device.
When distributed is True, the available memory is the minimum available memory of all GPUs.
"""
import torch
num_gpus = torch.cuda.device_count()
assert gpu_id < num_gpus
if torch.cuda.current_device() != gpu_id:
        print(
            f"WARN: current device is not {gpu_id}, but {torch.cuda.current_device()}, "
            "which may cause a useless CUDA context allocation on the current device.",
        )
free_gpu_memory, _ = torch.cuda.mem_get_info(gpu_id)
if distributed:
tensor = torch.tensor(free_gpu_memory, dtype=torch.float32).to(
torch.device("cuda", gpu_id)
)
torch.distributed.all_reduce(tensor, op=torch.distributed.ReduceOp.MIN)
free_gpu_memory = tensor.item()
return free_gpu_memory / (1 << 30)
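# Illustrative sketch (single-GPU case, no distributed reduction):
#
#   free_gb = get_available_gpu_memory(gpu_id=0, distributed=False)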
def is_same_type(values):
"""Return whether the elements in values are of the same type."""
if len(values) <= 1:
return True
else:
t = type(values[0])
return all(isinstance(v, t) for v in values[1:])
def read_jsonl(filename: str):
"""Read a JSONL file."""
rets = []
with open(filename) as fin:
for line in fin:
if line.startswith("#"):
continue
rets.append(json.loads(line))
return rets
def dump_state_text(filename, states, mode="w"):
"""Dump program state in a text file."""
from sglang.lang.interpreter import ProgramState
with open(filename, mode) as fout:
for i, s in enumerate(states):
if isinstance(s, str):
pass
elif isinstance(s, ProgramState):
s = s.text().strip()
else:
s = str(s)
fout.write(
"=" * 40 + f" {i} " + "=" * 40 + "\n" + s + "\n" + "=" * 80 + "\n\n"
)
class HttpResponse:
def __init__(self, resp):
self.resp = resp
def json(self):
return json.loads(self.resp.read())
@property
def status_code(self):
return self.resp.status
def http_request(url, json=None, stream=False):
"""A faster version of requests.post with low-level urllib API."""
if stream:
return requests.post(url, json=json, stream=True)
else:
req = urllib.request.Request(url)
req.add_header("Content-Type", "application/json; charset=utf-8")
if json is None:
data = None
else:
data = bytes(dumps(json), encoding="utf-8")
resp = urllib.request.urlopen(req, data=data)
return HttpResponse(resp)
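# Illustrative sketch (endpoint and payload are examples only):
#
#   resp = http_request(
#       "http://127.0.0.1:30000/generate",
#       json={"text": "Hello", "sampling_params": {"max_new_tokens": 8}},
#   )
#   print(resp.status_code, resp.json())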
def encode_image_base64(image_path):
"""Encode an image in base64."""
if isinstance(image_path, str):
with open(image_path, "rb") as image_file:
data = image_file.read()
return base64.b64encode(data).decode("utf-8")
elif isinstance(image_path, bytes):
return base64.b64encode(image_path).decode("utf-8")
else:
# image_path is PIL.WebPImagePlugin.WebPImageFile
image = image_path
buffered = BytesIO()
image.save(buffered, format="PNG")
return base64.b64encode(buffered.getvalue()).decode("utf-8")
def _is_chinese_char(cp):
"""Checks whether CP is the codepoint of a CJK character."""
# This defines a "chinese character" as anything in the CJK Unicode block:
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
#
# Note that the CJK Unicode block is NOT all Japanese and Korean characters,
# despite its name. The modern Korean Hangul alphabet is a different block,
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
# space-separated words, so they are not treated specially and handled
    # like all of the other languages.
if (
(cp >= 0x4E00 and cp <= 0x9FFF)
or (cp >= 0x3400 and cp <= 0x4DBF) #
or (cp >= 0x20000 and cp <= 0x2A6DF) #
or (cp >= 0x2A700 and cp <= 0x2B73F) #
or (cp >= 0x2B740 and cp <= 0x2B81F) #
or (cp >= 0x2B820 and cp <= 0x2CEAF) #
or (cp >= 0xF900 and cp <= 0xFAFF)
or (cp >= 0x2F800 and cp <= 0x2FA1F) #
): #
return True
return False
def find_printable_text(text):
"""Returns the longest printable substring of text that contains only entire words."""
# Borrowed from https://github.com/huggingface/transformers/blob/061580c82c2db1de9139528243e105953793f7a2/src/transformers/generation/streamers.py#L99
# After the symbol for a new line, we flush the cache.
if text.endswith("\n"):
return text
# If the last token is a CJK character, we print the characters.
elif len(text) > 0 and _is_chinese_char(ord(text[-1])):
return text
# Otherwise, prints until the last space char (simple heuristic to avoid printing incomplete words,
# which may change with the subsequent token -- there are probably smarter ways to do this!)
else:
return text[: text.rfind(" ") + 1]
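# Illustrative behavior:
#
#   find_printable_text("Hello wor")      # -> "Hello "  (hold back the partial word)
#   find_printable_text("Hello world\n")  # -> "Hello world\n"  (a newline flushes everything)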
def run_with_timeout(func, args=(), kwargs=None, timeout=None):
"""Run a function with timeout."""
ret_value = []
def _target_func():
ret_value.append(func(*args, **(kwargs or {})))
t = threading.Thread(target=_target_func)
t.start()
t.join(timeout=timeout)
if t.is_alive():
raise TimeoutError()
if not ret_value:
raise RuntimeError()
return ret_value[0]
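# Illustrative sketch (the wrapped call is hypothetical):
#
#   value = run_with_timeout(lambda x: x + 1, args=(1,), timeout=5)   # -> 2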
kill -9 $(ps aux | grep 'python' | grep -v 'grep' | awk '{print $2}')
import argparse
import glob
import multiprocessing
import os
import time
import unittest
from sglang.utils import run_with_timeout
def run_unittest_files(files, args):
for filename in files:
def func():
print(filename)
ret = unittest.main(module=None, argv=["", "-vb"] + [filename])
p = multiprocessing.Process(target=func)
def run_one_file():
p.start()
p.join()
try:
run_with_timeout(run_one_file, timeout=args.time_limit_per_file)
if p.exitcode != 0:
return False
except TimeoutError:
p.terminate()
time.sleep(5)
print(
f"\nTimeout after {args.time_limit_per_file} seconds "
f"when running {filename}"
)
return False
return True
if __name__ == "__main__":
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument(
"--time-limit-per-file",
type=int,
default=1000,
help="The time limit for running one file in seconds.",
)
args = arg_parser.parse_args()
files = glob.glob("**/test_*.py", recursive=True)
tic = time.time()
success = run_unittest_files(files, args)
if success:
print(f"Success. Time elapsed: {time.time() - tic:.2f}s")
else:
print(f"Fail. Time elapsed: {time.time() - tic:.2f}s")
exit(0 if success else -1)
import json
import unittest
from sglang.test.test_programs import test_mt_bench, test_stream
from sglang import Anthropic, set_default_backend
class TestAnthropicBackend(unittest.TestCase):
backend = None
chat_backend = None
def setUp(self):
cls = type(self)
if cls.backend is None:
cls.backend = Anthropic("claude-2")
set_default_backend(cls.backend)
def test_mt_bench(self):
test_mt_bench()
def test_stream(self):
test_stream()
if __name__ == "__main__":
unittest.main(warnings="ignore")
# from sglang.global_config import global_config
# global_config.verbosity = 2
# t = TestAnthropicBackend()
# t.setUp()
# t.test_mt_bench()
import unittest
from sglang.backend.runtime_endpoint import RuntimeEndpoint
import sglang as sgl
class TestBind(unittest.TestCase):
backend = None
def setUp(self):
cls = type(self)
if cls.backend is None:
cls.backend = RuntimeEndpoint(base_url="http://localhost:30000")
def test_bind(self):
@sgl.function
def few_shot_qa(s, prompt, question):
s += prompt
s += "Q: What is the capital of France?\n"
s += "A: Paris\n"
s += "Q: " + question + "\n"
s += "A:" + sgl.gen("answer", stop="\n")
few_shot_qa_2 = few_shot_qa.bind(
prompt="The following are questions with answers.\n\n"
)
tracer = few_shot_qa_2.trace()
print(tracer.last_node.print_graph_dfs() + "\n")
def test_pin(self):
@sgl.function
def few_shot_qa(s, prompt, question):
s += prompt
s += "Q: What is the capital of France?\n"
s += "A: Paris\n"
s += "Q: " + question + "\n"
s += "A:" + sgl.gen("answer", stop="\n")
few_shot_qa_2 = few_shot_qa.bind(
prompt="Answer the following questions as if you were a 5-year-old kid.\n\n"
)
few_shot_qa_2.pin(self.backend)
few_shot_qa_2.unpin(self.backend)
if __name__ == "__main__":
unittest.main(warnings="ignore")
# t = TestBind()
# t.setUp()
# t.test_pin()
import unittest
from sglang.test.test_programs import (
test_decode_int,
test_decode_json,
test_expert_answer,
test_few_shot_qa,
test_image_qa,
test_mt_bench,
test_parallel_decoding,
test_parallel_encoding,
test_react,
test_select,
test_stream,
test_tool_use,
)
from sglang import OpenAI, set_default_backend
class TestOpenAIBackend(unittest.TestCase):
backend = None
chat_backend = None
chat_vision_backend = None
def setUp(self):
cls = type(self)
if cls.backend is None:
cls.backend = OpenAI("gpt-3.5-turbo-instruct")
cls.chat_backend = OpenAI("gpt-3.5-turbo")
cls.chat_vision_backend = OpenAI("gpt-4-vision-preview")
def test_few_shot_qa(self):
set_default_backend(self.backend)
test_few_shot_qa()
def test_mt_bench(self):
set_default_backend(self.chat_backend)
test_mt_bench()
def test_select(self):
set_default_backend(self.backend)
test_select(check_answer=True)
def test_decode_int(self):
set_default_backend(self.backend)
test_decode_int()
def test_decode_json(self):
set_default_backend(self.backend)
test_decode_json()
def test_expert_answer(self):
set_default_backend(self.backend)
test_expert_answer()
def test_tool_use(self):
set_default_backend(self.backend)
test_tool_use()
def test_react(self):
set_default_backend(self.backend)
test_react()
def test_parallel_decoding(self):
set_default_backend(self.backend)
test_parallel_decoding()
def test_parallel_encoding(self):
set_default_backend(self.backend)
test_parallel_encoding()
def test_image_qa(self):
set_default_backend(self.chat_vision_backend)
test_image_qa()
def test_stream(self):
set_default_backend(self.backend)
test_stream()
if __name__ == "__main__":
unittest.main(warnings="ignore")
# from sglang.global_config import global_config
# global_config.verbosity = 2
# t = TestOpenAIBackend()
# t.setUp()
# t.test_decode_json()
"""
python3 -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
"""
import json
import unittest
from sglang.test.test_programs import (
test_decode_int,
test_decode_json,
test_expert_answer,
test_few_shot_qa,
test_mt_bench,
test_parallel_decoding,
test_parallel_encoding,
test_react,
test_regex,
test_select,
test_stream,
test_tool_use,
)
import sglang as sgl
class TestSRTBackend(unittest.TestCase):
backend = None
def setUp(self):
cls = type(self)
if cls.backend is None:
cls.backend = sgl.RuntimeEndpoint(base_url="http://localhost:30000")
sgl.set_default_backend(cls.backend)
def test_few_shot_qa(self):
test_few_shot_qa()
def test_mt_bench(self):
test_mt_bench()
def test_select(self):
test_select(check_answer=False)
def test_decode_int(self):
test_decode_int()
def test_expert_answer(self):
test_expert_answer()
def test_tool_use(self):
test_tool_use()
def test_parallel_decoding(self):
test_parallel_decoding()
def test_stream(self):
test_stream()
def test_regex(self):
test_regex()
# def test_parallel_encoding(self):
# test_parallel_encoding(check_answer=False)
if __name__ == "__main__":
unittest.main(warnings="ignore")
# from sglang.global_config import global_config
# global_config.verbosity = 2
# t = TestSRTBackend()
# t.setUp()
# t.test_regex()
import unittest
from sglang.backend.base_backend import BaseBackend
from sglang.lang.chat_template import get_chat_template
import sglang as sgl
class TestTracing(unittest.TestCase):
def test_few_shot_qa(self):
@sgl.function
def few_shot_qa(s, question):
s += "The following are questions with answers.\n\n"
s += "Q: What is the capital of France?\n"
s += "A: Paris\n"
s += "Q: " + question + "\n"
s += "A:" + sgl.gen("answer", stop="\n")
tracer = few_shot_qa.trace()
print(tracer.last_node.print_graph_dfs() + "\n")
def test_select(self):
@sgl.function
def capital(s):
s += "The capital of France is"
s += sgl.select("capital", ["Paris. ", "London. "])
s += "It is a city" + sgl.gen("description", stop=".")
tracer = capital.trace()
print(tracer.last_node.print_graph_dfs() + "\n")
def test_raise_warning(self):
@sgl.function
def wrong(s, question):
s += f"I want to ask {question}"
try:
tracer = wrong.trace()
raised = False
except TypeError:
raised = True
assert raised
def test_multi_function(self):
@sgl.function
def expand(s, tip):
s += (
"Please expand the following tip into a detailed paragraph:"
+ tip
+ "\n"
)
s += sgl.gen("detailed_tip")
@sgl.function
def tip_suggestion(s, topic):
s += "Here are 2 tips for " + topic + ".\n"
s += "1." + sgl.gen("tip_1", stop=["\n", ":", "."]) + "\n"
s += "2." + sgl.gen("tip_2", stop=["\n", ":", "."]) + "\n"
branch1 = expand(tip=s["tip_1"])
branch2 = expand(tip=s["tip_2"])
s += "Tip 1: " + branch1["detailed_tip"] + "\n"
s += "Tip 2: " + branch2["detailed_tip"] + "\n"
s += "In summary" + sgl.gen("summary")
compiled = tip_suggestion.compile()
compiled.print_graph()
sgl.set_default_backend(sgl.OpenAI("gpt-3.5-turbo-instruct"))
state = compiled.run(topic="staying healthy")
print(state.text() + "\n")
states = compiled.run_batch(
[
{"topic": "staying healthy"},
{"topic": "staying happy"},
{"topic": "earning money"},
],
temperature=0,
)
for s in states:
print(s.text() + "\n")
def test_role(self):
@sgl.function
def multi_turn_chat(s):
s += sgl.user("Who are you?")
s += sgl.assistant(sgl.gen("answer_1"))
s += sgl.user("Who created you?")
s += sgl.assistant(sgl.gen("answer_2"))
backend = BaseBackend()
backend.chat_template = get_chat_template("llama-2-chat")
compiled = multi_turn_chat.compile(backend=backend)
compiled.print_graph()
def test_fork(self):
@sgl.function
def tip_suggestion(s):
s += (
"Here are three tips for staying healthy: "
"1. Balanced Diet; "
"2. Regular Exercise; "
"3. Adequate Sleep\n"
)
forks = s.fork(3)
for i in range(3):
forks[i] += f"Now, expand tip {i+1} into a paragraph:\n"
forks[i] += sgl.gen(f"detailed_tip")
s += "Tip 1:" + forks[0]["detailed_tip"] + "\n"
s += "Tip 2:" + forks[1]["detailed_tip"] + "\n"
s += "Tip 3:" + forks[2]["detailed_tip"] + "\n"
s += "In summary" + sgl.gen("summary")
tracer = tip_suggestion.trace()
print(tracer.last_node.print_graph_dfs())
a = tip_suggestion.run(backend=sgl.OpenAI("gpt-3.5-turbo-instruct"))
print(a.text())
if __name__ == "__main__":
unittest.main(warnings="ignore")
# t = TestTracing()
# t.test_fork()
import multiprocessing as mp
import time
from dataclasses import dataclass
import torch
import torch.distributed as dist
from sglang.srt.managers.router.model_runner import ModelRunner
from sglang.srt.model_config import ModelConfig
@dataclass
class BenchBatch:
req_to_token_pool: torch.Tensor
token_to_kv_pool: torch.Tensor
input_ids: torch.Tensor = None
position_ids_offsets: torch.Tensor = None
seq_lens: torch.Tensor = None
prefix_lens: torch.Tensor = None
req_pool_indices: torch.Tensor = None
out_cache_loc: torch.Tensor = None
out_cache_cont_start: torch.Tensor = None
out_cache_cont_end: torch.Tensor = None
def __init__(self, model_runner: ModelRunner):
self.req_to_token_pool = model_runner.req_to_token_pool
self.token_to_kv_pool = model_runner.token_to_kv_pool
def init_prefill_batch(self, input_ids, batch_size, seq_len):
self.input_ids = input_ids
self.position_ids_offsets = torch.zeros(
batch_size, dtype=torch.int32, device="cuda"
)
self.seq_lens = torch.full(
(batch_size,), seq_len, dtype=torch.int32, device="cuda"
)
self.prefix_lens = torch.zeros(batch_size, dtype=torch.int32, device="cuda")
self.req_pool_indices = self.req_to_token_pool.alloc(batch_size)
self.out_cache_loc = self.token_to_kv_pool.alloc(batch_size * seq_len)
for i in range(batch_size):
n_idx = self.req_pool_indices[i].item()
self.req_to_token_pool.req_to_token[n_idx, :seq_len] = self.out_cache_loc[
i * seq_len : (i + 1) * seq_len
]
def update_extend(
self, input_ids, batch_size, prefix_len, extend_len, prefix_req_idx
):
self.input_ids = input_ids
self.position_ids_offsets = torch.zeros(
batch_size, dtype=torch.int32, device="cuda"
)
self.seq_lens = torch.full(
(batch_size,), prefix_len + extend_len, dtype=torch.int32, device="cuda"
)
self.prefix_lens = torch.full(
(batch_size,), prefix_len, dtype=torch.int32, device="cuda"
)
self.req_pool_indices = self.req_to_token_pool.alloc(batch_size)
self.out_cache_loc = self.token_to_kv_pool.alloc(batch_size * extend_len)
req_to_token = self.req_to_token_pool.req_to_token
fork_num = batch_size // prefix_req_idx.shape[0]
for i in range(batch_size):
p_idx = prefix_req_idx[i // fork_num].item()
n_idx = self.req_pool_indices[i].item()
req_to_token[n_idx, :prefix_len] = req_to_token[p_idx, :prefix_len]
req_to_token[
n_idx, prefix_len : prefix_len + extend_len
] = self.out_cache_loc[i * extend_len : (i + 1) * extend_len]
def update_decode(self, predict_ids, batch_size):
assert predict_ids.shape[0] == batch_size
assert batch_size == self.req_pool_indices.shape[0]
self.input_ids = predict_ids.reshape(-1)
self.prefix_lens = None
(
self.out_cache_loc,
self.out_cache_cont_start,
self.out_cache_cont_end,
) = self.token_to_kv_pool.alloc_contiguous(batch_size)
self.req_to_token_pool.req_to_token[
self.req_pool_indices, self.seq_lens
] = self.out_cache_loc
self.seq_lens.add_(1)
def prefill(model_runner: ModelRunner, batch: BenchBatch):
logits, _ = model_runner.forward_extend(
batch.input_ids,
batch.req_pool_indices,
batch.seq_lens,
batch.prefix_lens,
batch.position_ids_offsets,
batch.out_cache_loc,
False,
)
prob_out = torch.softmax(logits, dim=-1)
predict_ids = torch.argmax(prob_out, dim=1, keepdim=True)
predict_ids = predict_ids.detach().cpu().numpy()
return predict_ids
def extend(model_runner: ModelRunner, batch: BenchBatch):
logits, _ = model_runner.forward_extend(
batch.input_ids,
batch.req_pool_indices,
batch.seq_lens,
batch.prefix_lens,
batch.position_ids_offsets,
batch.out_cache_loc,
True,
)
prob_out = torch.softmax(logits, dim=-1)
predict_ids = torch.argmax(prob_out, dim=1, keepdim=True)
predict_ids = predict_ids.detach().cpu().numpy()
return predict_ids
def decode(model_runner: ModelRunner, batch: BenchBatch):
logits = model_runner.forward_decode(
batch.input_ids,
batch.req_pool_indices,
batch.seq_lens,
None,
batch.position_ids_offsets,
None,
batch.out_cache_cont_start,
batch.out_cache_cont_end,
)
prob_out = torch.softmax(logits, dim=-1)
predict_ids = torch.argmax(prob_out, dim=1, keepdim=True)
predict_ids = predict_ids.detach().cpu().numpy()
return predict_ids
def bench_generate_worker(
model_path,
tp_rank,
tp_size,
shared_num,
unique_num,
shared_len,
unique_len,
decode_len,
model_mode,
):
assert unique_num % shared_num == 0
model_config = ModelConfig(path=model_path)
model_runner = ModelRunner(
model_config=model_config,
mem_fraction_static=0.8,
tp_rank=tp_rank,
tp_size=tp_size,
nccl_port=28888,
model_mode=model_mode,
)
batch = BenchBatch(model_runner)
# warm up
for _ in range(1):
input_ids = torch.randint(
low=5, high=100, size=(shared_num * shared_len,)
).cuda()
batch.init_prefill_batch(input_ids, shared_num, shared_len)
_ = prefill(model_runner, batch)
input_ids = torch.randint(
low=5, high=100, size=(unique_num * unique_len,)
).cuda()
batch.update_extend(
input_ids, unique_num, shared_len, unique_len, batch.req_pool_indices
)
predict_ids = extend(model_runner, batch)
for i in range(decode_len):
predict_ids = torch.from_numpy(predict_ids).cuda()
batch.update_decode(predict_ids, unique_num)
predict_ids = decode(model_runner, batch)
model_runner.req_to_token_pool.clear()
model_runner.token_to_kv_pool.clear()
if tp_size > 1:
dist.barrier()
prefill_start = time.time()
input_ids = torch.randint(low=5, high=100, size=(shared_num * shared_len,)).cuda()
batch.init_prefill_batch(input_ids, shared_num, shared_len)
_ = prefill(model_runner, batch)
if tp_rank == 0:
print(f"prefill: {(time.time() - prefill_start) * 1000:.2f} ms")
extend_start = time.time()
input_ids = torch.randint(low=5, high=100, size=(unique_num * unique_len,)).cuda()
batch.update_extend(
input_ids, unique_num, shared_len, unique_len, batch.req_pool_indices
)
predict_ids = extend(model_runner, batch)
if tp_rank == 0:
print(f"extend: {(time.time() - extend_start) * 1000:.2f} ms")
for i in range(decode_len):
decode_start = time.time()
predict_ids = torch.from_numpy(predict_ids).cuda()
batch.update_decode(predict_ids, unique_num)
predict_ids = decode(model_runner, batch)
if tp_rank == 0:
print(f"decode {i}: {(time.time() - decode_start) * 1000:.2f} ms")
def bench_generate(
model_path,
tp_size,
shared_num,
unique_num,
shared_len,
unique_len,
decode_len,
model_mode,
):
print(
f"tp_size: {tp_size}, "
f"shared_num: {shared_num}, "
f"unique_num: {unique_num}, "
f"shared_len: {shared_len}, "
f"unique_len: {unique_len}, "
f"decode_len: {decode_len}, "
f"model_mode: {model_mode}"
)
workers = []
for tp_rank in range(tp_size):
proc = mp.Process(
target=bench_generate_worker,
args=(
model_path,
tp_rank,
tp_size,
shared_num,
unique_num,
shared_len,
unique_len,
decode_len,
model_mode,
),
)
proc.start()
workers.append(proc)
for proc in workers:
proc.join()
if __name__ == "__main__":
bench_generate(
model_path="meta-llama/Llama-2-7b-chat-hf",
tp_size=1,
shared_num=1,
unique_num=32,
shared_len=256,
unique_len=256,
decode_len=8,
model_mode=[],
)
import argparse
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
@torch.inference_mode()
def normal_text(args):
t = AutoTokenizer.from_pretrained(args.model_path)
m = AutoModelForCausalLM.from_pretrained(
args.model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True
)
m.cuda()
print(m)
prompts = [
"The capital of France is",
"The capital of the United Kindom is",
"Today is a sunny day and I like",
]
max_new_tokens = 32
for p in prompts:
if isinstance(p, str):
input_ids = t.encode(p, return_tensors="pt").cuda()
else:
input_ids = torch.tensor([p], device="cuda")
output_ids = m.generate(
input_ids, do_sample=False, max_new_tokens=max_new_tokens
)
output_str = t.decode(output_ids[0])
print(output_str)
prefill_logits = m.forward(input_ids).logits[0][-1]
print("prefill logits", prefill_logits)
@torch.inference_mode()
def synthetic_tokens(args):
t = AutoTokenizer.from_pretrained(args.model_path)
m = AutoModelForCausalLM.from_pretrained(
args.model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True
)
m.cuda()
print(m)
input_len = 256
output_len = 8
prompts = [list(range(5, 5 + input_len))]
for p in prompts:
input_ids = p
for i in range(output_len + 1):
prefill_logits = m.forward(torch.tensor([input_ids], device="cuda")).logits[
0
][-1]
if i == 0:
print("prefill logits", prefill_logits)
else:
print("decode", i - 1, prefill_logits)
input_ids.append(torch.argmax(prefill_logits).item())
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--model-path",
type=str,
default="TinyLlama/TinyLlama-1.1B-Chat-v0.4",
# default="meta-llama/Llama-2-7b-chat-hf",
)
args = parser.parse_args()
normal_text(args)
# synthetic_tokens(args)
import multiprocessing
import os
import time
import numpy as np
import torch
import torch.distributed as dist
import transformers
from sglang.srt.managers.router.infer_batch import Batch, ForwardMode, Req
from sglang.srt.managers.router.model_runner import ModelRunner
from sglang.srt.model_config import ModelConfig
from sglang.srt.sampling_params import SamplingParams
def test_generate_worker(model_path, tp_rank, tp_size):
model_config = ModelConfig(path=model_path)
model = ModelRunner(model_config, 0.8, tp_rank, tp_size, 28888)
tokenizer = transformers.AutoTokenizer.from_pretrained(model_path)
# Input
prompts = [
"The capital of France is",
"Today is a sunny day and I like",
]
sampling_params = SamplingParams(temperature=0)
cut_num = 4
reqs = []
for i in range(len(prompts)):
req = Req(i)
req.input_ids = tokenizer.encode(prompts[i])[:cut_num]
req.sampling_params = sampling_params
reqs.append(req)
# Prefill
batch = Batch(reqs, model.req_to_token_pool, model.token_to_kv_pool, None)
batch.init_extend_batch(model.model_config.vocab_size(), None)
logits, _ = model.forward(batch, ForwardMode.EXTEND)
next_token_ids, next_token_probs = batch.sample(logits)
print("extend logits (first)", logits)
# Extend
for i in range(len(prompts)):
req = reqs[i]
req.input_ids += tokenizer.encode(prompts[i])[cut_num:]
req.prefix_indices = model.req_to_token_pool.req_to_token[
batch.req_pool_indices[i], :cut_num
]
batch = Batch(reqs, model.req_to_token_pool, model.token_to_kv_pool, None)
batch.init_extend_batch(model.model_config.vocab_size(), None)
logits, _ = model.forward(batch, ForwardMode.EXTEND)
next_token_ids, next_token_probs = batch.sample(logits)
print("extend logits", logits)
print(
"next_token_ids", next_token_ids, [tokenizer.decode(x) for x in next_token_ids]
)
# Decode
for i in range(6):
batch.update_for_decode(next_token_ids.cpu().numpy())
logits = model.forward(batch, ForwardMode.DECODE)
next_token_ids, next_token_probs = batch.sample(logits)
print(
"next_token_ids",
next_token_ids,
[tokenizer.decode(x) for x in next_token_ids],
)
def test_generate(model_path, tp_size):
workers = []
for tp_rank in range(tp_size):
proc = multiprocessing.Process(
target=test_generate_worker,
args=(
model_path,
tp_rank,
tp_size,
),
)
proc.start()
workers.append(proc)
for proc in workers:
proc.join()
if __name__ == "__main__":
os.environ["TOKENIZERS_PARALLELISM"] = "false"
test_generate("TinyLlama/TinyLlama-1.1B-Chat-v0.4", 1)
# Reference output for TinyLlama-1.1B-Chat-v0.4
# extend logits (first) tensor([[-10.0312, -9.5000, 0.8896, ..., -4.9375, -3.2402, -3.3633],
# [ -9.1797, -10.2500, 2.7168, ..., -4.3359, -4.0664, -4.1289]],
# device='cuda:0', dtype=torch.float16)
# extend logits tensor([[-8.3125, -7.1172, 3.3359, ..., -4.9531, -4.1289, -3.4121],
# [-9.6406, -9.0547, 4.0195, ..., -5.3086, -4.7188, -4.4609]],
# device='cuda:0', dtype=torch.float16)
# next_token_ids tensor([3681, 304], device='cuda:0') ['Paris', 'to']
# next_token_ids tensor([29889, 748], device='cuda:0') ['.', 'go']
# next_token_ids tensor([ 13, 363], device='cuda:0') ['\n', 'for']
# next_token_ids tensor([1576, 263], device='cuda:0') ['The', 'a']
# next_token_ids tensor([7483, 6686], device='cuda:0') ['capital', 'walk']
# next_token_ids tensor([310, 297], device='cuda:0') ['of', 'in']
# next_token_ids tensor([278, 278], device='cuda:0') ['the', 'the']
import multiprocessing
import time
import numpy as np
import torch
import torch.distributed as dist
from sglang.srt.managers.router.model_runner import ModelRunner
from sglang.srt.model_config import ModelConfig
def test_generate_worker(
model_path, tp_rank, tp_size, batch_size, input_len, output_len
):
model_config = ModelConfig(path=model_path)
model = ModelRunner(model_config, 0.8, tp_rank, tp_size, 28888)
# Prepare data
input_ids = np.vstack([np.arange(5, input_len + 5) for _ in range(batch_size)])
input_ids = input_ids.reshape(-1)
input_ids = torch.tensor(input_ids).cuda()
def init_batch_data(model, batch_size, input_len):
req_pool_indices = model.req_to_token_pool.alloc(batch_size)
seq_lens = torch.full(
(batch_size,), input_len, dtype=torch.int32, device="cuda"
)
prefix_lens = torch.zeros(batch_size, dtype=torch.int32, device="cuda")
position_ids_offsets = torch.zeros(batch_size, dtype=torch.int32, device="cuda")
out_cache_loc = model.token_to_kv_pool.alloc(batch_size * input_len)
for i in range(batch_size):
req_idx = req_pool_indices[i].item()
model.req_to_token_pool.req_to_token[req_idx, :input_len] = out_cache_loc[
i * input_len : (i + 1) * input_len
]
return (
req_pool_indices,
seq_lens,
prefix_lens,
position_ids_offsets,
out_cache_loc,
)
def prefill(print_logits):
nonlocal predict_ids
logits, _ = model.forward_prefill(
input_ids,
req_pool_indices,
seq_lens,
prefix_lens,
position_ids_offsets,
out_cache_loc,
False,
)
prob_out = torch.softmax(logits, dim=-1)
predict_ids = torch.argmax(prob_out, dim=1, keepdim=True)
predict_ids = predict_ids.detach().cpu().numpy()
if print_logits and tp_rank == 0:
print("prefill logits", logits, logits.shape)
def decode(print_logits):
nonlocal predict_ids
(
out_cache_loc,
out_cache_cont_start,
out_cache_cont_end,
) = model.token_to_kv_pool.alloc_contiguous(batch_size)
model.req_to_token_pool.req_to_token[req_pool_indices, seq_lens] = out_cache_loc
seq_lens.add_(1)
logits = model.forward_decode(
torch.from_numpy(predict_ids).cuda().reshape(-1),
req_pool_indices,
seq_lens,
None,
position_ids_offsets,
None,
out_cache_cont_start,
out_cache_cont_end,
)
prob_out = torch.softmax(logits, dim=-1)
predict_ids = torch.argmax(prob_out, dim=1, keepdim=True)
predict_ids = predict_ids.detach().cpu().numpy()
if print_logits and tp_rank == 0:
print("decode", i, logits)
# Warm up
(
req_pool_indices,
seq_lens,
prefix_lens,
position_ids_offsets,
out_cache_loc,
) = init_batch_data(model, batch_size, input_len)
predict_ids = None
prefill(True)
for i in range(output_len):
decode(True)
for i in range(batch_size):
req_idx = req_pool_indices[i].item()
model.token_to_kv_pool.free(
model.req_to_token_pool.req_to_token[req_idx, : seq_lens[i]]
)
model.req_to_token_pool.free(req_pool_indices)
# Benchmark
if tp_size > 1:
dist.barrier()
start_time = prefill_start_time = time.time()
(
req_pool_indices,
seq_lens,
prefix_lens,
position_ids_offsets,
out_cache_loc,
) = init_batch_data(model, batch_size, input_len)
prefill(False)
if tp_rank == 0:
print(f"prefill cost: {(time.time() - prefill_start_time) * 1000:.2f} ms")
for i in range(output_len):
step_start = time.time()
decode(False)
step_end = time.time()
if i % 100 == 0 or i == output_len - 1:
if tp_rank == 0:
print(f"step {i} cost: {(step_end - step_start) * 1000:.2f} ms")
end_time = time.time()
if tp_rank == 0:
print(f"total cost: {(end_time - start_time) * 1000:.2f}")
def test_generate(model_path, tp_size, batch_size, input_len, output_len):
workers = []
for tp_rank in range(tp_size):
proc = multiprocessing.Process(
target=test_generate_worker,
args=(
model_path,
tp_rank,
tp_size,
batch_size,
input_len,
output_len,
),
)
proc.start()
workers.append(proc)
for proc in workers:
proc.join()
if __name__ == "__main__":
test_generate("TinyLlama/TinyLlama-1.1B-Chat-v0.4", 1, 1, 256, 8)
# test_generate("meta-llama/Llama-2-7b-chat-hf", 1, 16, 256, 8)
# Reference output for TinyLlama-1.1B-Chat-v0.4 (1, 32, 8)
# prefill logits tensor([[-1.3380e-03, 4.4702e-01, 2.9082e+00, ..., -1.8398e+00,
# 1.8281e+00, 2.1816e+00]], device='cuda:0')
# decode 0 tensor([[-0.3904, 0.8784, 3.6934, ..., -2.4473, 1.5811, 2.0098]],
# device='cuda:0')
# decode 1 tensor([[-0.3552, 0.0635, 2.5781, ..., -2.5820, 1.3047, 1.7607]],
# device='cuda:0')
# decode 2 tensor([[-1.5645, -1.1963, 3.8145, ..., -2.9766, 1.0244, 1.0645]],
# device='cuda:0')
# decode 3 tensor([[-1.3682, -0.6548, 4.2734, ..., -2.8711, 1.1172, 1.1494]],
# device='cuda:0')
# decode 4 tensor([[-1.0205, -0.0060, 4.4844, ..., -2.7090, 1.6143, 1.8135]],
# device='cuda:0')
# decode 5 tensor([[ 0.4260, 1.6006, 4.3633, ..., -2.2480, 2.5547, 2.8379]],
# device='cuda:0')
# decode 6 tensor([[ 0.7095, 2.1816, 5.0078, ..., -2.1309, 3.0293, 3.0840]],
# device='cuda:0')
# decode 7 tensor([[-0.2883, 1.1289, 4.7188, ..., -2.4023, 2.1055, 2.1836]],
# device='cuda:0')
# Reference output for TinyLlama-1.1B-Chat-v0.4 (1, 256, 8)
# prefill logits tensor([[-2.5840, -2.7227, 6.8047, ..., -2.3613, 0.1224, 0.5952]],
# device='cuda:0')
# decode 0 tensor([[-0.6235, -0.7690, 9.2891, ..., -1.4922, 2.8008, 2.9531]],
# device='cuda:0')
# decode 1 tensor([[-1.3662, -1.4648, 7.1250, ..., -1.7861, 1.7363, 1.8857]],
# device='cuda:0')
# decode 2 tensor([[-0.8540, -0.5947, 9.1328, ..., -2.1211, 2.9707, 2.8945]],
# device='cuda:0')
# decode 3 tensor([[ 0.0652, 1.0312, 8.1250, ..., -2.0586, 3.4727, 3.6172]],
# device='cuda:0')
# decode 4 tensor([[-0.0459, 1.0098, 9.1406, ..., -2.1797, 3.8320, 3.9355]],
# device='cuda:0')
# decode 5 tensor([[ 0.2964, 1.3564, 9.8828, ..., -2.1602, 4.1836, 4.2422]],
# device='cuda:0')
# decode 6 tensor([[ 0.6475, 1.8105, 10.1250, ..., -2.0098, 4.2578, 4.4062]],
# device='cuda:0')
# decode 7 tensor([[ 0.4985, 1.4746, 9.9062, ..., -1.9141, 3.9863, 4.3047]],
# device='cuda:0')
import multiprocessing
import time
import numpy as np
import torch
import torch.distributed as dist
from sglang.srt.hf_transformers_utils import get_processor
from sglang.srt.managers.router.infer_batch import ForwardMode
from sglang.srt.managers.router.model_runner import InputMetadata, ModelRunner
from sglang.srt.model_config import ModelConfig
from sglang.srt.utils import load_image
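# Multimodal (LLaVA) smoke test: run one prefill with image features plus 16 decode
# steps through the low-level ModelRunner API and check the decoded caption.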
def init_batch_data(model, batch_size, input_len):
req_pool_indices = model.req_to_token_pool.alloc(batch_size)
seq_lens = torch.full((batch_size,), input_len, dtype=torch.int32, device="cuda")
prefix_lens = torch.zeros(batch_size, dtype=torch.int32, device="cuda")
position_ids_offsets = torch.zeros(batch_size, dtype=torch.int32, device="cuda")
out_cache_loc = model.token_to_kv_pool.alloc(batch_size * input_len)
for i in range(batch_size):
        # Index by the allocated request slot rather than the loop counter.
        req_idx = req_pool_indices[i].item()
        model.req_to_token_pool.req_to_token[req_idx, :input_len] = out_cache_loc[
            i * input_len : (i + 1) * input_len
        ]
return (
req_pool_indices,
seq_lens,
prefix_lens,
position_ids_offsets,
out_cache_loc,
)
def prefill(model, tp_rank, params, print_logits):
logits, _ = model.forward_extend_multi_modal(
*params,
False,
)
prob_out = torch.softmax(logits, dim=-1)
predict_ids = torch.argmax(prob_out, dim=1, keepdim=True)
predict_ids = predict_ids.detach().cpu().numpy()
if print_logits and tp_rank == 0:
print("prefill logits", logits, logits.shape)
return predict_ids
def decode(step, model, tp_rank, batch_size, predict_ids, params, print_logits):
(
req_pool_indices,
seq_lens,
prefix_lens,
position_ids_offsets,
out_cache_loc,
) = params
(
out_cache_loc,
out_cache_cont_start,
out_cache_cont_end,
) = model.token_to_kv_pool.alloc_contiguous(batch_size)
model.req_to_token_pool.req_to_token[req_pool_indices, seq_lens] = out_cache_loc
seq_lens.add_(1)
logits = model.forward_decode(
torch.from_numpy(predict_ids).cuda().reshape(-1),
req_pool_indices,
seq_lens,
None,
position_ids_offsets,
None,
out_cache_cont_start,
out_cache_cont_end,
)
prob_out = torch.softmax(logits, dim=-1)
predict_ids = torch.argmax(prob_out, dim=1, keepdim=True)
predict_ids = predict_ids.detach().cpu().numpy()
if print_logits and tp_rank == 0:
print("decode", step, logits)
return predict_ids
def test_generate_worker(
model_path,
tp_rank,
tp_size,
):
model_config = ModelConfig(path=model_path)
model = ModelRunner(model_config, 0.8, tp_rank, tp_size, 28888)
# print(model.model)
# Prepare data
prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: <image>\nDescribe this picture ASSISTANT:"
image_path = "/home/ubuntu/sglang/test/lang/image.png"
image = load_image(image_path)
processor = get_processor("llava-hf/llava-1.5-7b-hf")
input_ids = processor.tokenizer.encode(prompt)
pixel_values = processor.image_processor(image)["pixel_values"]
input_ids, offset = model.model.pad_input_ids(
input_ids,
[
0,
],
)
params = init_batch_data(model, 1, len(input_ids))
# inference
output_ids = []
prefill_params = (
torch.tensor(np.array(input_ids)).cuda(),
np.array(pixel_values),
[offset],
*params,
)
predict_ids = prefill(model, tp_rank=0, params=prefill_params, print_logits=False)
output_ids.append(predict_ids[0][0])
for i in range(16):
predict_ids = decode(
i,
model,
tp_rank=0,
batch_size=1,
predict_ids=predict_ids,
params=params,
print_logits=False,
)
output_ids.append(predict_ids[0][0])
# detokenization
output = processor.tokenizer.batch_decode(
[output_ids], skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]
assert (
output
== "The image features a man standing on the back of a yellow taxi cab, holding"
)
def test_generate(model_path, tp_size):
workers = []
for tp_rank in range(tp_size):
proc = multiprocessing.Process(
target=test_generate_worker,
args=(
model_path,
tp_rank,
tp_size,
),
)
proc.start()
workers.append(proc)
for proc in workers:
proc.join()
if __name__ == "__main__":
test_generate("liuhaotian/llava-v1.5-7b", 1)
import flashinfer
import pytest
import torch
from sglang.srt.layers.extend_attention import extend_attention_fwd
from sglang.srt.layers.token_attention import token_attention_fwd
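# These tests cross-check SGLang's triton attention kernels (extend_attention_fwd for
# prefill/extend, token_attention_fwd for decode) against flashinfer's paged-KV-cache
# kernels on randomly generated inputs.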
@pytest.mark.parametrize("batch_size", [12, 37, 67])
@pytest.mark.parametrize("kv_len", [54, 97])
@pytest.mark.parametrize("qo_len", [37, 17])
@pytest.mark.parametrize("num_kv_heads", [4])
@pytest.mark.parametrize("num_qo_heads", [4, 32])
@pytest.mark.parametrize("head_dim", [128])
@pytest.mark.parametrize("use_wrapper", [True, False])
def test_batch_prefill_with_paged_kv_cache(
batch_size,
kv_len,
qo_len,
num_kv_heads,
num_qo_heads,
head_dim,
use_wrapper,
):
q = torch.randn(batch_size * qo_len, num_qo_heads, head_dim).to(0).half()
q_indptr = torch.arange(0, batch_size + 1).to(0).int() * qo_len
total_tokens = kv_len * batch_size
kv_data = torch.randn(total_tokens, 2, num_kv_heads, 1, head_dim).to(0).half()
kv_indptr = torch.arange(0, batch_size + 1).to(0).int() * kv_len
kv_indices = torch.arange(0, total_tokens).to(0).int()
kv_last_page_len = torch.full((batch_size,), 1, dtype=torch.int32).to(0)
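    # Page size is 1 here: each KV "page" holds a single token, so kv_indices simply
    # enumerates token slots and every last-page length is 1.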
# init args for triton kernel
k_extend = (
kv_data.view(batch_size, kv_len, 2, -1)[:, -qo_len:, 0]
.contiguous()
.view(-1, num_kv_heads, head_dim)
)
v_extend = (
kv_data.view(batch_size, kv_len, 2, -1)[:, -qo_len:, 1]
.contiguous()
.view(-1, num_kv_heads, head_dim)
)
o_triton = torch.empty_like(q)
k_buffer = kv_data[:, 0].view(-1, num_kv_heads, head_dim).contiguous()
v_buffer = kv_data[:, 1].view(-1, num_kv_heads, head_dim).contiguous()
req_to_token = torch.arange(0, total_tokens).to(0).int().view(batch_size, kv_len)
b_req_idx = torch.arange(0, batch_size).to(0).int()
b_seq_len = torch.full((batch_size,), kv_len, dtype=torch.int32).to(0)
b_start_loc_extend = torch.arange(0, batch_size).to(0).int() * qo_len
b_seq_len_extend = torch.full((batch_size,), qo_len, dtype=torch.int32).to(0)
max_len_in_batch = kv_len
max_len_extend = qo_len
extend_attention_fwd(
q,
k_extend,
v_extend,
o_triton,
k_buffer,
v_buffer,
req_to_token,
b_req_idx,
None, # b_start_loc = None
b_seq_len,
None, # b_seq_len_prefix = None
b_start_loc_extend,
b_seq_len_extend,
max_len_in_batch,
max_len_extend,
)
if use_wrapper:
wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper()
wrapper.begin_forward(q_indptr, batch_size, num_qo_heads, num_kv_heads)
o = wrapper.forward(
q, q_indptr, kv_data, kv_indptr, kv_indices, kv_last_page_len
)
else:
o = flashinfer.batch_prefill_with_paged_kv_cache(
q,
q_indptr,
kv_data,
kv_indptr,
kv_indices,
kv_last_page_len,
)
print("Mean: ", torch.mean(torch.abs(o - o_triton)))
print("Max: ", torch.max(torch.abs(o - o_triton)))
assert torch.allclose(o, o_triton, rtol=1e-2, atol=1e-3)
@pytest.mark.parametrize("batch_size", [12, 17, 37])
@pytest.mark.parametrize("kv_len", [54, 127, 537])
@pytest.mark.parametrize("num_kv_heads", [32])
@pytest.mark.parametrize("num_qo_heads", [32])
@pytest.mark.parametrize("head_dim", [128])
def test_batch_decode_with_paged_kv_cache(
batch_size,
kv_len,
num_kv_heads,
num_qo_heads,
head_dim,
):
    # note(lsyin): under pytest, the number of heads cannot change across
    # parametrized runs because the triton kernel has a cache.
    # To test a different decode shape, change the parameters in __main__ and run
    # the decode test only once.
q = torch.randn(batch_size, num_qo_heads, head_dim).to(0).half()
total_tokens = kv_len * batch_size
kv_data = torch.randn(total_tokens, 2, num_kv_heads, 1, head_dim).to(0).half()
kv_indptr = torch.arange(0, batch_size + 1).to(0).int() * kv_len
kv_indices = torch.arange(0, total_tokens).to(0).int()
kv_last_page_len = torch.full((batch_size,), 1, dtype=torch.int32).to(0)
# init args for triton kernel
k_buffer = kv_data[:, 0].view(-1, num_kv_heads, head_dim).contiguous()
v_buffer = kv_data[:, 1].view(-1, num_kv_heads, head_dim).contiguous()
o_triton = torch.empty_like(q)
req_to_token = (
torch.arange(0, kv_len * batch_size).to(0).int().view(batch_size, kv_len)
)
b_req_idx = torch.arange(0, batch_size).to(0).int()
b_start_loc = torch.arange(0, batch_size).to(0).int() * kv_len
b_seq_len = torch.full((batch_size,), kv_len, dtype=torch.int32).to(0)
max_len_in_batch = kv_len
other_kv_index = 0
token_attention_fwd(
q,
k_buffer,
v_buffer,
o_triton,
req_to_token,
b_req_idx,
b_start_loc,
b_seq_len,
max_len_in_batch,
other_kv_index,
total_tokens,
)
wrapper = flashinfer.BatchDecodeWithPagedKVCacheWrapper()
wrapper.begin_forward(
kv_indptr,
kv_last_page_len,
batch_size,
num_qo_heads,
num_kv_heads,
head_dim,
1,
"NONE",
"float16",
)
o = wrapper.forward(q, kv_data, kv_indptr, kv_indices, kv_last_page_len)
print("Mean: ", torch.mean(torch.abs(o - o_triton)))
print("Max: ", torch.max(torch.abs(o - o_triton)))
assert torch.allclose(o, o_triton, rtol=1e-2, atol=2e-3)
if __name__ == "__main__":
test_batch_prefill_with_paged_kv_cache(12, 54, 37, 8, 8, 128, False)
test_batch_prefill_with_paged_kv_cache(37, 1111, 456, 32, 32, 128, True)
test_batch_decode_with_paged_kv_cache(12, 54, 4, 32, 128)
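    # The calls above exercise a single shape each; the full parameter sweep runs
    # under pytest (e.g. `pytest -q` on this file), subject to the head-count caveat
    # noted in test_batch_decode_with_paged_kv_cache.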