Unverified Commit 22352d47 authored by Lianmin Zheng's avatar Lianmin Zheng Committed by GitHub
Browse files

Improve streaming, log_level, memory report, weight loading, and benchmark script (#7632)


Co-authored-by: default avatarKan Wu <wukanustc@gmail.com>
parent c5131f7a
......@@ -4,6 +4,7 @@ from typing import List
import numpy as np
import tqdm
from sglang.srt.disaggregation.utils import FAKE_BOOTSTRAP_HOST
from sglang.srt.managers.io_struct import GenerateReqInput
from sglang.srt.managers.tokenizer_manager import TokenizerManager
......@@ -20,17 +21,21 @@ def warmup(name: str) -> callable:
return decorator
async def execute_warmups(warmup_names: List[str], tokenizer_manager: TokenizerManager):
async def execute_warmups(
    disaggregation_mode: str,
    warmup_names: List[str],
    tokenizer_manager: TokenizerManager,
):
    """Run each requested warmup from the registry, skipping unknown names.

    Every registered warmup coroutine is awaited with the disaggregation
    mode and the tokenizer manager.
    """
    for name in warmup_names:
        if name not in _warmup_registry:
            # Unknown names are logged and skipped rather than failing startup.
            logger.warning(f"Could not find custom warmup {name}")
            continue
        logger.info(f"Running warmup {name}")
        handler = _warmup_registry[name]
        await handler(disaggregation_mode, tokenizer_manager)
@warmup("voice_chat")
async def voice_chat(tokenizer_manager: TokenizerManager):
async def voice_chat(disaggregation_mode: str, tokenizer_manager: TokenizerManager):
# this warms up the fused_moe triton kernels and caches them
# if we don't do this we break real time inference for voice chat
for i in tqdm.trange(1, 512):
......@@ -44,4 +49,8 @@ async def voice_chat(tokenizer_manager: TokenizerManager):
"min_p": 0.0,
},
)
if disaggregation_mode != "null":
generate_req_input.bootstrap_room = 0
generate_req_input.bootstrap_host = FAKE_BOOTSTRAP_HOST
await tokenizer_manager.generate_request(generate_req_input, None).__anext__()
"""
Usage:
# replay from a folder
python3 replay_request_dump.py --file-number 100 --parallel 512 --input-folder /data/lianmin/sglang_request_dump/grok-mini-0220-engine-5756f8f94-28bm6/
# replay from a single file
python3 replay_request_dump.py --parallel 512 --input-file /data/sglang_crash_dump/memx-cti-34-sr1.xpop.twttr.net/crash_dump_2025-06-04_20-13-18.pkl
"""
import argparse
import glob
import json
import pickle
import time
from concurrent.futures import ThreadPoolExecutor
from dataclasses import asdict
from datetime import datetime
import requests
from sglang.bench_serving import set_ulimit
from sglang.utils import get_exception_traceback
def read_records(files):
    """Load and concatenate replay records from a list of pickle files.

    Each file contains either a dict with a "requests" list or a bare list
    of records; both forms are flattened into a single list.

    Args:
        files: Iterable of pickle file paths.

    Returns:
        A list with all records from all files, in file order.
    """
    records = []
    for path in files:
        # Fix: the original `pickle.load(open(path, "rb"))` leaked the file
        # handle; `with` closes it deterministically.
        with open(path, "rb") as fin:
            payload = pickle.load(fin)
        if isinstance(payload, dict) and "requests" in payload:
            records.extend(payload["requests"])
        else:
            records.extend(payload)
    return records
def run_one_request_internal(record):
    """Replay a single recorded request against the target server.

    Args:
        record: Tuple (req, output, replay_init_time, start_time, end_time,
            idx) where `start_time` is the send time relative to the start
            of the recorded trace and `output` is the originally recorded
            response (used to pin the output length).

    The function sleeps until the request's scheduled relative time, posts
    it to the `/generate` endpoint, and prints token counts for comparison
    against the recording.

    Raises:
        ValueError: If no response payload was received.
    """
    (req, output, replay_init_time, start_time, end_time, idx) = record

    # Wait until this request's original relative send time has elapsed.
    time.sleep(max(0, start_time - (time.time() - replay_init_time)))

    if "completion_tokens" in output.get("meta_info", {}):
        recorded_completion_tokens = output["meta_info"]["completion_tokens"]
    else:
        recorded_completion_tokens = ""

    json_data = asdict(req)
    stream = json_data["stream"]
    if args.ignore_eos:
        json_data["sampling_params"]["ignore_eos"] = True
    if recorded_completion_tokens:
        # Force the replay to generate exactly as many tokens as recorded.
        json_data["sampling_params"]["max_new_tokens"] = recorded_completion_tokens

    response = requests.post(
        f"http://{args.host}:{args.port}/generate",
        json=json_data,
        stream=stream,
    )

    # Fix: `ret` was previously unbound (NameError) if a streamed response
    # produced no "data:" chunk before ending; initialize and check it.
    ret = None
    if stream:
        for chunk in response.iter_lines(decode_unicode=False):
            chunk = chunk.decode("utf-8")
            if chunk and chunk.startswith("data:"):
                if chunk == "data: [DONE]":
                    break
                ret = json.loads(chunk[5:].strip("\n"))
    else:
        ret = response.json()
    if ret is None:
        raise ValueError(f"No response payload received for request {idx}")

    prompt_tokens = ret["meta_info"]["prompt_tokens"]
    completion_tokens = ret["meta_info"]["completion_tokens"]
    print(
        f"{idx=}, {start_time=:.2f}, {prompt_tokens=}, "
        f"{completion_tokens=}, {recorded_completion_tokens=}"
    )
def run_one_request(record):
    """Replay one record; log any exception instead of propagating it.

    Swallowing per-request errors keeps the thread pool replaying the rest
    of the trace even when individual requests fail.
    """
    try:
        run_one_request_internal(record)
    except Exception:
        print(f"Hit an exception: {get_exception_traceback()}")
def main(records):
    """Replay all records in parallel, preserving their relative timing.

    Each record is rewritten in place from (req, output, start_time,
    end_time) to (req, output, replay_init_time, relative_start_time,
    end_time, index) before being handed to the worker pool.
    """
    if not records:
        return

    # Timestamps are shifted so the first request is sent at t=0.
    base_time = records[0][-2]
    base_time_str = datetime.fromtimestamp(base_time).strftime("%y-%m-%d %H:%M:%S")
    print(f"{base_time_str=}")

    replay_init_time = time.time()
    for idx, (req, output, start_time, end_time) in enumerate(records):
        records[idx] = (
            req,
            output,
            replay_init_time,
            start_time - base_time,
            end_time,
            idx,
        )

    with ThreadPoolExecutor(args.parallel) as executor:
        executor.map(run_one_request, records)
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", type=str, default="localhost")
    parser.add_argument("--port", type=int, default=30000)
    parser.add_argument(
        "--input-folder", type=str, default=None, help="Folder containing pickle files"
    )
    parser.add_argument(
        "--input-file", type=str, default=None, help="Single pickle file to process"
    )
    parser.add_argument("--file-number", type=int, default=1)
    parser.add_argument("--req-number", type=int, default=1000000)
    parser.add_argument("--req-start", type=int, default=0)
    parser.add_argument("--parallel", type=int, default=512)
    parser.add_argument("--idx", type=int, default=None)
    parser.add_argument("--ignore-eos", action="store_true")
    args = parser.parse_args()
    set_ulimit()

    # Resolve the list of pickle files to replay.
    files = []
    if args.input_file:
        files = [args.input_file]
        if args.file_number > 1:
            print("Warning: --file-number is ignored when --input-file is provided.")
    elif args.input_folder:
        files = glob.glob(f"{args.input_folder}/*.pkl")
        files = files[: args.file_number]
    else:
        print("Error: Either --input-folder or --input-file must be provided.")
        exit(1)
    print(f"{files=}")

    records = read_records(files)

    # Sort by the receive time, before filtering
    records.sort(key=lambda x: x[-2])
    # Fix: --req-number was parsed but never applied; cap the replayed window.
    records = records[args.req_start : args.req_start + args.req_number]

    # Fix: `if args.idx:` made `--idx 0` unselectable; test against None.
    if args.idx is not None:
        records = [records[args.idx]]
        print(f"testing {args.idx=}")
        print(f"{records[0]}")

    print(f"{len(records)=}")
    main(records)
......@@ -173,10 +173,11 @@ suites = {
# TestFile("test_deepep_intranode.py", 50),
# TestFile("test_deepep_low_latency.py", 50),
# TestFile("test_moe_deepep_eval_accuracy_large.py", 250),
# Disabled because it hangs on the CI.
# TestFile("test_moe_ep.py", 181),
TestFile("test_disaggregation.py", 270),
TestFile("test_disaggregation_different_tp.py", 155),
TestFile("test_full_deepseek_v3.py", 463),
TestFile("test_moe_ep.py", 181),
],
"per-commit-8-gpu-amd": [
TestFile("test_full_deepseek_v3.py", 250),
......
......@@ -178,7 +178,7 @@ class TestVisionChunkedPrefill(CustomTestCase):
print(output_chunked)
print("output without chunked prefill:")
print(output_no_chunked)
assert output_chunked == output_no_chunked
self.assertEqual(output_chunked, output_no_chunked)
def test_chunked_prefill(self):
    # Exercise both non-batched and batched runs, with a single-frame input
    # and a mixed multi-frame batch; the helper presumably compares chunked
    # vs. non-chunked prefill outputs — confirm against _test_chunked_prefill.
    self._test_chunked_prefill(batches=[False, True], num_frames=[1, [2, 6, 8, 10]])
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment