Unverified Commit b9a54e09 authored by Lianmin Zheng, committed by GitHub

[minor] sync code on python/sglang/test/test_deterministic.py and improve ci tests (#11777)


Co-authored-by: Stefan He <hebiaobuaa@gmail.com>
Co-authored-by: Byron Hsu <byronhsu1230@gmail.com>
parent 20b8d230
@@ -319,7 +319,7 @@ jobs:
cd test/srt
python3 run_suite.py --suite per-commit-4-gpu --auto-partition-id ${{ matrix.part }} --auto-partition-size 2
-unit-test-backend-8-gpu:
unit-test-backend-8-gpu-h200:
needs: [check-changes, unit-test-backend-2-gpu, sgl-kernel-build-wheels]
if: always() && !failure() && !cancelled() &&
((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
@@ -348,7 +348,7 @@ jobs:
timeout-minutes: 20
run: |
cd test/srt
-python3 run_suite.py --suite per-commit-8-gpu --auto-partition-id ${{ matrix.part }} --auto-partition-size 2
python3 run_suite.py --suite per-commit-8-gpu-h200 --auto-partition-id ${{ matrix.part }} --auto-partition-size 2
unit-test-backend-8-gpu-h20:
needs: [check-changes, unit-test-backend-2-gpu, sgl-kernel-build-wheels]
@@ -695,7 +695,7 @@ jobs:
timeout-minutes: 20
run: |
cd test/srt
-python3 run_suite.py --suite per-commit-8-gpu-deepep
python3 run_suite.py --suite per-commit-8-gpu-h200-deepep
unit-test-backend-8-gpu-deepseek-v32:
needs: [check-changes, unit-test-backend-2-gpu, sgl-kernel-build-wheels]
@@ -722,7 +722,7 @@ jobs:
timeout-minutes: 20
run: |
cd test/srt
-python3 run_suite.py --suite per-commit-8-gpu-deepseek-v32
python3 run_suite.py --suite per-commit-8-gpu-h200-deepseek-v32
unit-test-backend-4-gpu-b200:
needs: [check-changes, unit-test-backend-2-gpu, sgl-kernel-build-wheels]
@@ -761,12 +761,12 @@ jobs:
sgl-kernel-unit-test, sgl-kernel-mla-test, sgl-kernel-benchmark-test,
unit-test-frontend, unit-test-backend-1-gpu,
-unit-test-backend-2-gpu, unit-test-backend-4-gpu, unit-test-backend-8-gpu,
unit-test-backend-2-gpu, unit-test-backend-4-gpu, unit-test-backend-8-gpu-h200,
performance-test-1-gpu-part-1, performance-test-1-gpu-part-2, performance-test-1-gpu-part-3,
performance-test-2-gpu,
accuracy-test-1-gpu, accuracy-test-2-gpu,
unit-test-deepep-4-gpu, unit-test-deepep-8-gpu,
-# unit-test-backend-4-gpu-b200,
unit-test-backend-4-gpu-b200,
]
if: always()
runs-on: ubuntu-latest
...
@@ -116,6 +116,12 @@ python3 -m sglang.test.send_one
python3 -m sglang.profiler
```
You can also combine the above operations into a single command
```
python3 -m sglang.test.send_one --profile
```
### Profiler Trace Merger for Distributed Traces
SGLang now supports automatic merging of profiling traces from distributed setups with multiple parallelism types (TP, DP, PP, EP). This feature is particularly useful for analyzing performance across distributed runs.
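To give a rough sense of what such a merge involves, the sketch below concatenates the `traceEvents` of several per-rank Chrome trace JSON files and tags each event with its source rank. This is only an illustration under assumed file names (`trace_tp*.json`); it is not SGLang's built-in merger, which would also need to remap process/thread IDs and cover the other parallelism dimensions.
```python
# Illustrative only: merge per-rank Chrome trace files by concatenating their
# traceEvents and recording the source rank on each event. File names and the
# one-file-per-rank layout are assumptions, not SGLang's actual output format.
import glob
import json

merged = {"traceEvents": []}
for rank, path in enumerate(sorted(glob.glob("trace_tp*.json"))):
    with open(path) as f:
        trace = json.load(f)
    for event in trace.get("traceEvents", []):
        # Tag each event with its source rank so the merged timeline stays readable.
        event.setdefault("args", {})["rank"] = rank
        merged["traceEvents"].append(event)

with open("merged_trace.json", "w") as f:
    json.dump(merged, f)
```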
...
@@ -879,6 +879,8 @@ class BatchMultimodalDecodeReq(BaseBatchReq):
placeholder_tokens_idx: List[Optional[List[int]]]
placeholder_tokens_val: List[Optional[List[int]]]
return_bytes: List[bool]
# The trainer step id. Used to know which step's weights are used for sampling.
token_steps: List[List[int]] = None
...
@@ -150,6 +150,9 @@ class SchedulerStats:
engine_startup_time: float = 0.0
engine_load_weights_time: float = 0.0
# CUDA graph
is_cuda_graph: float = 0.0
class SchedulerMetricsCollector:
@@ -499,6 +502,13 @@ class SchedulerMetricsCollector:
labelnames=list(labels.keys()) + ["stage"],
)
self.is_cuda_graph = Gauge(
name="sglang:is_cuda_graph",
documentation="Whether the batch is using CUDA graph.",
labelnames=labels.keys(),
multiprocess_mode="mostrecent",
)
def _log_gauge(self, gauge, data: Union[int, float]) -> None:
# Convenience function for logging to gauge.
gauge.labels(**self.labels).set(data)
@@ -574,6 +584,9 @@ class SchedulerMetricsCollector:
self.engine_load_weights_time, stats.engine_load_weights_time
)
# CUDA graph
self._log_gauge(self.is_cuda_graph, stats.is_cuda_graph)
self.last_log_time = time.perf_counter()
def log_grammar_stats(self, grammar_stats) -> None:
...
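As a quick sanity check of the `sglang:is_cuda_graph` gauge added above, one can scrape the server's Prometheus metrics and filter for the metric name. The endpoint path, port, and the need to launch the server with metrics enabled are assumptions on my part, not something shown in this diff:
```python
# Hedged example: print the new gauge from the Prometheus metrics endpoint.
# Assumes a locally running server on port 30000 with metrics exporting enabled.
import requests

metrics = requests.get("http://127.0.0.1:30000/metrics").text
for line in metrics.splitlines():
    if line.startswith("sglang:is_cuda_graph"):
        print(line)
```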
@@ -509,6 +509,11 @@ class ServerArgs:
"""
Orchestrates the handling of various server arguments, ensuring proper configuration and validation.
"""
if self.model_path.lower() in ["none", "dummy"]:
# Skip for dummy models
return
# Handle deprecated arguments.
self._handle_deprecated_args()
...
@@ -66,7 +66,7 @@ class MockModelRunner:
enable_memory_saver=False,
)
# Required by torch native backend
-self.server_args = ServerArgs(model_path="fake_model_path")
self.server_args = ServerArgs(model_path="dummy")
@unittest.skipIf(not torch.cuda.is_available(), "Test requires CUDA")
...
@@ -2,7 +2,17 @@
Batch the same prompt in random batch sizes, and test if the results are consistent across different trials.
Usage:
-python3 -m sglang.test.test_deterministic --n-trials <numer_of_trials> --test-mode <single|mixed|prefix> --profile
# Single mode: test determinism with varying batch sizes
python3 -m sglang.test.test_deterministic --n-trials 50 --test-mode single
# Mixed mode: test with mixed prompts
python3 -m sglang.test.test_deterministic --n-trials 50 --test-mode mixed
# Prefix mode: test with shared prefixes
python3 -m sglang.test.test_deterministic --n-start 1 --n-trials 50 --test-mode prefix
# Radix Cache Consistency mode: test radix cache determinism (cached vs uncached prefill)
python3 -m sglang.test.test_deterministic --test-mode radix_cache
""" """
import argparse import argparse
@@ -67,7 +77,12 @@ class BenchArgs:
"--test-mode",
type=str,
default=BenchArgs.test_mode,
-choices=["single", "mixed", "prefix"],
choices=[
"single",
"mixed",
"prefix",
"radix_cache",
],
)
parser.add_argument("--profile", action="store_true")
parser.add_argument(
@@ -83,26 +98,50 @@ class BenchArgs:
def send_single(
args,
-batch_size: int,
batch_size: int = 1,
profile: bool = False,
profile_steps: int = 3,
profile_by_stage: bool = False,
return_full_response: bool = False,
input_ids: List[int] = None,
max_new_tokens: int = None,
):
base_url = f"http://{args.host}:{args.port}"
-prompt = [PROMPT_1] * batch_size
-json_data = {
-"text": prompt,
-"sampling_params": {
-"temperature": args.temperature,
-"max_new_tokens": args.max_new_tokens,
-"frequency_penalty": args.frequency_penalty,
-"presence_penalty": args.presence_penalty,
-},
-"return_logprob": args.return_logprob,
-"stream": args.stream,
-}
# Use input_ids if provided, otherwise use text prompts
if input_ids is not None:
json_data = {
"input_ids": input_ids,
"sampling_params": {
"temperature": args.temperature,
"max_new_tokens": (
max_new_tokens
if max_new_tokens is not None
else args.max_new_tokens
),
"frequency_penalty": args.frequency_penalty,
"presence_penalty": args.presence_penalty,
},
"return_logprob": args.return_logprob,
"stream": args.stream,
}
else:
prompt = [PROMPT_1] * batch_size
json_data = {
"text": prompt,
"sampling_params": {
"temperature": args.temperature,
"max_new_tokens": (
max_new_tokens
if max_new_tokens is not None
else args.max_new_tokens
),
"frequency_penalty": args.frequency_penalty,
"presence_penalty": args.presence_penalty,
},
"return_logprob": args.return_logprob,
"stream": args.stream,
}
if args.sampling_seed is not None:
# sglang server cannot parse None value for sampling_seed
@@ -119,6 +158,11 @@ def send_single(
stream=args.stream,
)
if response.status_code != 200:
ret = response.json()
print(f"Error: {ret}")
return None
if args.stream:
for chunk in response.iter_lines(decode_unicode=False):
chunk = chunk.decode("utf-8")
@@ -128,13 +172,13 @@
ret = json.loads(chunk[5:].strip("\n"))
else:
ret = response.json()
-ret = ret[0]
-if response.status_code != 200:
-print(ret)
-return -1
-return ret["text"]
ret = ret[0] if isinstance(ret, list) else ret
if return_full_response:
return ret
else:
return ret["text"]
def send_mixed(args, batch_size: int):
@@ -235,7 +279,6 @@ def test_deterministic(args):
text = text.replace("\n", " ")
print(f"Trial {i} with batch size {batch_size}: {text}")
texts.append(text)
print(f"Total samples: {len(texts)}, Unique samples: {len(set(texts))}")
return [len(set(texts))]
@@ -297,6 +340,163 @@ def test_deterministic(args):
results.append(len(set(outputs[i])))
return results
elif args.test_mode == "radix_cache":
# Radix mode requires logprobs to compare results
args.return_logprob = True
print("\n=== Prefill Cache Consistency Test ===")
print(
"This test verifies prefill request produces consistent logprobs w/ and w/o cache.\n"
)
# We noticed that we cannot call flush cache before any request, otherwise it will hang.
warmup_response = send_single(
args, input_ids=[1] * 64, max_new_tokens=65, return_full_response=True
)
# Flush cache first to make sure there is no cache hit from previous tests
flush_response = requests.post(f"http://{args.host}:{args.port}/flush_cache")
print(f"Step 1: Generating random 64 token IDs...")
# Use a reasonable token ID range (e.g., 1-50000 for most tokenizers)
# Avoid special tokens like 0 (padding), 1 (BOS), 2 (EOS)
# set seed for random.randint
random.seed(42)
initial_token_ids = [random.randint(100, 50000) for _ in range(64)]
print(f"✓ Using {len(initial_token_ids)} initial tokens")
print(f" Initial token IDs: {initial_token_ids}")
print(
f"\nStep 2: Generating 2 tokens from {len(initial_token_ids)} token prefix..."
)
first_response = send_single(
args,
input_ids=initial_token_ids,
max_new_tokens=100,
return_full_response=True,
)
first_output_text = first_response["text"]
first_output_token_ids = first_response["output_ids"]
first_output_logprobs = first_response["meta_info"]["output_token_logprobs"]
expected_token_id = first_output_token_ids[-1]
expected_logprob = first_output_logprobs[-1][0]
print(f"✓ Generated {len(first_output_token_ids)} tokens")
print(f' Output text: "{first_output_text}"')
print(
f"\nStep 3: Generating with radix cache (164 tokens prefill, should hit > 128 tokens cache, based on page size)..."
)
prefix_token_ids = initial_token_ids + first_output_token_ids[:-1]
print(
f" Prefix: {len(initial_token_ids)} initial + 64 generated = {len(prefix_token_ids)} tokens"
)
print(f"Using Prompt: {prefix_token_ids}")
cached_response = send_single(
args,
input_ids=prefix_token_ids,
max_new_tokens=1,
return_full_response=True,
)
cached_logprobs = cached_response["meta_info"]["output_token_logprobs"]
cached_token_data = cached_logprobs[0]
cached_logprob = cached_token_data[0]
cached_token_id = cached_token_data[1]
print(f"✓ Generated with cache:")
print(f" Token ID: {cached_token_id}")
print(f" Logprob: {cached_logprob:.10f}")
print(f"\nStep 4: Flushing cache...")
flush_response = requests.post(f"http://{args.host}:{args.port}/flush_cache")
print(
f"\nStep 5: Generating without cache (same 164 tokens prefill, no cache)..."
)
print(f"Using Prompt: {prefix_token_ids}")
uncached_response = send_single(
args,
input_ids=prefix_token_ids,
max_new_tokens=1,
return_full_response=True,
)
uncached_logprobs = uncached_response["meta_info"]["output_token_logprobs"]
uncached_token_data = uncached_logprobs[0]
uncached_logprob = uncached_token_data[0]
uncached_token_id = uncached_token_data[1]
print(f"✓ Generated without cache:")
print(f" Token ID: {uncached_token_id}")
print(f" Logprob: {uncached_logprob:.10f}")
# Step 6: Compare results
print(f"\n{'='*60}")
print("Comparison 1: Decode (Request 1) vs Prefill with Cache (Request 2)")
print("=" * 60)
# Compare first request (decode) vs second request (prefill with cache)
# We expect them to be different (different kernels)
decode_vs_prefill_token_match = expected_token_id == cached_token_id
decode_vs_prefill_logprob_match = expected_logprob == cached_logprob
print(
f" Decode token (Request 1): ID={expected_token_id}, logprob={expected_logprob:.10f}"
)
print(
f" Prefill w/ cache token (Request 2): ID={cached_token_id}, logprob={cached_logprob:.10f}"
)
print(
f" Token ID match: {'✓ YES' if decode_vs_prefill_token_match else '✗ NO'}"
)
print(
f" Logprob match: {'✓ YES' if decode_vs_prefill_logprob_match else '✗ NO'}"
)
if not decode_vs_prefill_logprob_match:
diff = abs(expected_logprob - cached_logprob)
print(f" Logprob difference: {diff:.10e}")
print(f" Note: We expect these to be DIFFERENT (decode vs prefill kernels)")
print(f"\n{'='*60}")
print(
"Comparison 2: Cached Prefill (Request 2) vs Uncached Prefill (Request 3)"
)
print("=" * 60)
# Main test: compare cached vs uncached prefill (should be identical)
token_match = cached_token_id == uncached_token_id
logprob_match = cached_logprob == uncached_logprob
print(
f" Cached prefill token (Request 2): ID={cached_token_id}, logprob={cached_logprob:.10f}"
)
print(
f" Uncached prefill token (Request 3): ID={uncached_token_id}, logprob={uncached_logprob:.10f}"
)
print(f" Token ID match: {'✓ YES' if token_match else '✗ NO'}")
if not token_match:
print(f" Cached: {cached_token_id}")
print(f" Uncached: {uncached_token_id}")
print(f" Logprob match: {'✓ YES' if logprob_match else '✗ NO'}")
if not logprob_match:
print(f" Cached: {cached_logprob:.10f}")
print(f" Uncached: {uncached_logprob:.10f}")
diff = abs(cached_logprob - uncached_logprob)
print(f" Difference: {diff:.10e}")
print(f" Note: We expect these to be IDENTICAL (both prefill kernels)")
print(f"\n{'='*60}")
if token_match and logprob_match:
print("✓✓✓ TEST PASSED - Radix cache is consistent! ✓✓✓")
return [1]
else:
print("✗✗✗ TEST FAILED - Radix cache produces different results! ✗✗✗")
return [0]
else:
raise ValueError(f"Invalid test mode: {args.test_mode}")
...
@@ -36,7 +36,6 @@ class TestLMHeadFP32(unittest.TestCase):
raise unittest.SkipTest("needs CUDA GPU")
def _make_logprocessor(self, vocab_size, enable_fp32):
-ServerArgs.__post_init__ = lambda self: None # disable validation
set_global_server_args_for_scheduler(ServerArgs(model_path="dummy"))
get_global_server_args().enable_dp_lm_head = False
get_global_server_args().enable_fp32_lm_head = enable_fp32
...
@@ -66,10 +66,10 @@ suites = {
TestFile("rl/test_update_weights_from_disk.py", 114),
TestFile("rl/test_update_weights_from_tensor.py", 48),
TestFile("test_abort.py", 51),
-TestFile("test_build_eagle_tree.py", 8),
TestFile("test_chunked_prefill.py", 313),
TestFile("test_create_kvindices.py", 2),
TestFile("test_deterministic.py", 300),
TestFile("test_build_eagle_tree.py", 8),
TestFile("test_eagle_infer_a.py", 370), TestFile("test_eagle_infer_a.py", 370),
TestFile("test_eagle_infer_b.py", 700), TestFile("test_eagle_infer_b.py", 700),
TestFile("test_eagle_infer_beta.py", 300), TestFile("test_eagle_infer_beta.py", 300),
...@@ -158,12 +158,17 @@ suites = { ...@@ -158,12 +158,17 @@ suites = {
TestFile("test_multi_instance_release_memory_occupation.py", 64), TestFile("test_multi_instance_release_memory_occupation.py", 64),
TestFile("test_pp_single_node.py", 481), TestFile("test_pp_single_node.py", 481),
], ],
"per-commit-8-gpu": [ "per-commit-8-gpu-h200": [
TestFile("lora/test_lora_llama4.py", 400), TestFile("lora/test_lora_llama4.py", 400),
TestFile("test_deepseek_v3_basic.py", 275), TestFile("test_deepseek_v3_basic.py", 275),
TestFile("test_deepseek_v3_mtp.py", 275), TestFile("test_deepseek_v3_mtp.py", 275),
TestFile("test_disaggregation_hybrid_attention.py", 200), TestFile("test_disaggregation_hybrid_attention.py", 200),
], ],
"per-commit-8-gpu-h20": [
TestFile("quant/test_w4a8_deepseek_v3.py", 371),
TestFile("test_disaggregation_different_tp.py", 600),
TestFile("test_disaggregation_pp.py", 140),
],
"per-commit-4-gpu-b200": [ "per-commit-4-gpu-b200": [
# TestFile("test_gpt_oss_4gpu.py", 600), # TestFile("test_gpt_oss_4gpu.py", 600),
# TestFile("test_deepseek_v3_fp4_4gpu.py", 3600), # TestFile("test_deepseek_v3_fp4_4gpu.py", 3600),
...@@ -172,17 +177,12 @@ suites = { ...@@ -172,17 +177,12 @@ suites = {
TestFile("ep/test_deepep_small.py", 531), TestFile("ep/test_deepep_small.py", 531),
TestFile("ep/test_mooncake_ep_small.py", 450), TestFile("ep/test_mooncake_ep_small.py", 450),
], ],
"per-commit-8-gpu-deepep": [ "per-commit-8-gpu-h200-deepep": [
TestFile("ep/test_deepep_large.py", 338), TestFile("ep/test_deepep_large.py", 338),
], ],
"per-commit-8-gpu-deepseek-v32": [ "per-commit-8-gpu-h200-deepseek-v32": [
TestFile("test_deepseek_v32_basic.py", 275), TestFile("test_deepseek_v32_basic.py", 275),
], ],
"per-commit-8-gpu-h20": [
TestFile("test_disaggregation_different_tp.py", 600),
TestFile("test_disaggregation_pp.py", 140),
TestFile("quant/test_w4a8_deepseek_v3.py", 371),
],
"vllm_dependency_test": [ "vllm_dependency_test": [
TestFile("quant/test_awq.py", 163), TestFile("quant/test_awq.py", 163),
TestFile("test_bnb.py", 5), TestFile("test_bnb.py", 5),
......