Unverified Commit 22352d47 authored by Lianmin Zheng's avatar Lianmin Zheng Committed by GitHub
Browse files

Improve streaming, log_level, memory report, weight loading, and benchmark script (#7632)


Co-authored-by: default avatarKan Wu <wukanustc@gmail.com>
parent c5131f7a
......@@ -4,6 +4,7 @@ from typing import List
import numpy as np
import tqdm
from sglang.srt.disaggregation.utils import FAKE_BOOTSTRAP_HOST
from sglang.srt.managers.io_struct import GenerateReqInput
from sglang.srt.managers.tokenizer_manager import TokenizerManager
......@@ -20,17 +21,21 @@ def warmup(name: str) -> callable:
return decorator
async def execute_warmups(warmup_names: List[str], tokenizer_manager: TokenizerManager):
async def execute_warmups(
    disaggregation_mode: str,
    warmup_names: List[str],
    tokenizer_manager: TokenizerManager,
):
    """Run each requested warmup from the registry, skipping unknown names.

    Every registered warmup coroutine is awaited with the disaggregation
    mode and the tokenizer manager.
    """
    for name in warmup_names:
        if name not in _warmup_registry:
            # Unknown names are logged and skipped rather than failing startup.
            logger.warning(f"Could not find custom warmup {name}")
            continue
        logger.info(f"Running warmup {name}")
        handler = _warmup_registry[name]
        await handler(disaggregation_mode, tokenizer_manager)
@warmup("voice_chat")
async def voice_chat(tokenizer_manager: TokenizerManager):
async def voice_chat(disaggregation_mode: str, tokenizer_manager: TokenizerManager):
# this warms up the fused_moe triton kernels and caches them
# if we don't do this we break real time inference for voice chat
for i in tqdm.trange(1, 512):
......@@ -44,4 +49,8 @@ async def voice_chat(tokenizer_manager: TokenizerManager):
"min_p": 0.0,
},
)
if disaggregation_mode != "null":
generate_req_input.bootstrap_room = 0
generate_req_input.bootstrap_host = FAKE_BOOTSTRAP_HOST
await tokenizer_manager.generate_request(generate_req_input, None).__anext__()
"""
Usage:
# replay from a folder
python3 replay_request_dump.py --file-number 100 --parallel 512 --input-folder /data/lianmin/sglang_request_dump/grok-mini-0220-engine-5756f8f94-28bm6/
# replay from a single file
python3 replay_request_dump.py --parallel 512 --input-file /data/sglang_crash_dump/memx-cti-34-sr1.xpop.twttr.net/crash_dump_2025-06-04_20-13-18.pkl
"""
import argparse
import glob
import json
import pickle
import time
from concurrent.futures import ThreadPoolExecutor
from dataclasses import asdict
from datetime import datetime
import requests
from sglang.bench_serving import set_ulimit
from sglang.utils import get_exception_traceback
def read_records(files):
    """Load and concatenate replay records from a list of pickle files.

    Each file contains either a dict with a "requests" list or a bare list
    of records; both forms are flattened into a single list.

    Args:
        files: Iterable of pickle file paths.

    Returns:
        A list with all records from all files, in file order.
    """
    records = []
    for path in files:
        # Fix: the original `pickle.load(open(path, "rb"))` leaked the file
        # handle; `with` closes it deterministically.
        with open(path, "rb") as fin:
            payload = pickle.load(fin)
        if isinstance(payload, dict) and "requests" in payload:
            records.extend(payload["requests"])
        else:
            records.extend(payload)
    return records
def run_one_request_internal(record):
    """Replay a single recorded request against the target server.

    Args:
        record: Tuple (req, output, replay_init_time, start_time, end_time,
            idx) where `start_time` is the send time relative to the start
            of the recorded trace and `output` is the originally recorded
            response (used to pin the output length).

    The function sleeps until the request's scheduled relative time, posts
    it to the `/generate` endpoint, and prints token counts for comparison
    against the recording.

    Raises:
        ValueError: If no response payload was received.
    """
    (req, output, replay_init_time, start_time, end_time, idx) = record

    # Wait until this request's original relative send time has elapsed.
    time.sleep(max(0, start_time - (time.time() - replay_init_time)))

    if "completion_tokens" in output.get("meta_info", {}):
        recorded_completion_tokens = output["meta_info"]["completion_tokens"]
    else:
        recorded_completion_tokens = ""

    json_data = asdict(req)
    stream = json_data["stream"]
    if args.ignore_eos:
        json_data["sampling_params"]["ignore_eos"] = True
    if recorded_completion_tokens:
        # Force the replay to generate exactly as many tokens as recorded.
        json_data["sampling_params"]["max_new_tokens"] = recorded_completion_tokens

    response = requests.post(
        f"http://{args.host}:{args.port}/generate",
        json=json_data,
        stream=stream,
    )

    # Fix: `ret` was previously unbound (NameError) if a streamed response
    # produced no "data:" chunk before ending; initialize and check it.
    ret = None
    if stream:
        for chunk in response.iter_lines(decode_unicode=False):
            chunk = chunk.decode("utf-8")
            if chunk and chunk.startswith("data:"):
                if chunk == "data: [DONE]":
                    break
                ret = json.loads(chunk[5:].strip("\n"))
    else:
        ret = response.json()
    if ret is None:
        raise ValueError(f"No response payload received for request {idx}")

    prompt_tokens = ret["meta_info"]["prompt_tokens"]
    completion_tokens = ret["meta_info"]["completion_tokens"]
    print(
        f"{idx=}, {start_time=:.2f}, {prompt_tokens=}, "
        f"{completion_tokens=}, {recorded_completion_tokens=}"
    )
def run_one_request(record):
    """Replay one record; log any exception instead of propagating it.

    Swallowing per-request errors keeps the thread pool replaying the rest
    of the trace even when individual requests fail.
    """
    try:
        run_one_request_internal(record)
    except Exception:
        print(f"Hit an exception: {get_exception_traceback()}")
def main(records):
    """Replay all records in parallel, preserving their relative timing.

    Each record is rewritten in place from (req, output, start_time,
    end_time) to (req, output, replay_init_time, relative_start_time,
    end_time, index) before being handed to the worker pool.
    """
    if not records:
        return

    # Timestamps are shifted so the first request is sent at t=0.
    base_time = records[0][-2]
    base_time_str = datetime.fromtimestamp(base_time).strftime("%y-%m-%d %H:%M:%S")
    print(f"{base_time_str=}")

    replay_init_time = time.time()
    for idx, (req, output, start_time, end_time) in enumerate(records):
        records[idx] = (
            req,
            output,
            replay_init_time,
            start_time - base_time,
            end_time,
            idx,
        )

    with ThreadPoolExecutor(args.parallel) as executor:
        executor.map(run_one_request, records)
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", type=str, default="localhost")
    parser.add_argument("--port", type=int, default=30000)
    parser.add_argument(
        "--input-folder", type=str, default=None, help="Folder containing pickle files"
    )
    parser.add_argument(
        "--input-file", type=str, default=None, help="Single pickle file to process"
    )
    parser.add_argument("--file-number", type=int, default=1)
    parser.add_argument("--req-number", type=int, default=1000000)
    parser.add_argument("--req-start", type=int, default=0)
    parser.add_argument("--parallel", type=int, default=512)
    parser.add_argument("--idx", type=int, default=None)
    parser.add_argument("--ignore-eos", action="store_true")
    args = parser.parse_args()
    set_ulimit()

    # Resolve the list of pickle files to replay.
    files = []
    if args.input_file:
        files = [args.input_file]
        if args.file_number > 1:
            print("Warning: --file-number is ignored when --input-file is provided.")
    elif args.input_folder:
        files = glob.glob(f"{args.input_folder}/*.pkl")
        files = files[: args.file_number]
    else:
        print("Error: Either --input-folder or --input-file must be provided.")
        exit(1)
    print(f"{files=}")

    records = read_records(files)

    # Sort by the receive time, before filtering
    records.sort(key=lambda x: x[-2])
    # Fix: --req-number was parsed but never applied; cap the replayed window.
    records = records[args.req_start : args.req_start + args.req_number]

    # Fix: `if args.idx:` made `--idx 0` unselectable; test against None.
    if args.idx is not None:
        records = [records[args.idx]]
        print(f"testing {args.idx=}")
        print(f"{records[0]}")

    print(f"{len(records)=}")
    main(records)
......@@ -173,10 +173,11 @@ suites = {
# TestFile("test_deepep_intranode.py", 50),
# TestFile("test_deepep_low_latency.py", 50),
# TestFile("test_moe_deepep_eval_accuracy_large.py", 250),
# Disabled because it hangs on the CI.
# TestFile("test_moe_ep.py", 181),
TestFile("test_disaggregation.py", 270),
TestFile("test_disaggregation_different_tp.py", 155),
TestFile("test_full_deepseek_v3.py", 463),
TestFile("test_moe_ep.py", 181),
],
"per-commit-8-gpu-amd": [
TestFile("test_full_deepseek_v3.py", 250),
......
......@@ -178,7 +178,7 @@ class TestVisionChunkedPrefill(CustomTestCase):
print(output_chunked)
print("output without chunked prefill:")
print(output_no_chunked)
assert output_chunked == output_no_chunked
self.assertEqual(output_chunked, output_no_chunked)
def test_chunked_prefill(self):
    # Exercise both non-batched and batched runs, with a single-frame input
    # and a mixed multi-frame batch; the helper presumably compares chunked
    # vs. non-chunked prefill outputs — confirm against _test_chunked_prefill.
    self._test_chunked_prefill(batches=[False, True], num_frames=[1, [2, 6, 8, 10]])
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment