Unverified Commit 6a63a985 authored by Lianmin Zheng's avatar Lianmin Zheng Committed by GitHub
Browse files

minor code sync (#12403)

parent 4d2f17bd
......@@ -1473,6 +1473,16 @@ class WatchLoadUpdateReq(BaseReq):
loads: List[GetLoadReqOutput]
@dataclass
class SetInjectDumpMetadataReqInput(BaseReq):
    """Request payload for attaching metadata to subsequent tensor dumps."""

    # Arbitrary metadata mapping to inject into dumps; the exact schema is
    # caller-defined — presumably consumed by the dump handler, TODO confirm.
    dump_metadata: Dict[str, Any]
@dataclass
class SetInjectDumpMetadataReqOutput(BaseReq):
    """Response paired with SetInjectDumpMetadataReqInput (per the
    ReqInput/ReqOutput naming convention enforced in this module)."""

    # True when the metadata was accepted/applied.
    success: bool
@dataclass
class LazyDumpTensorsReqInput(BaseReq):
    """Request to trigger a lazy tensor dump; carries no payload fields."""

    pass
......@@ -1504,6 +1514,3 @@ def _check_all_req_types():
raise ValueError(
f"{name} is a subclass of BaseReq but not follow the naming convention."
)
_check_all_req_types()
......@@ -870,13 +870,13 @@ class TokenizerMetricsCollector:
def check_time_to_first_token_straggler(self, value: float) -> bool:
    """Return True if *value* (a time-to-first-token observation) falls in
    the slowest ~1% tail of the recorded TTFT histogram.

    The span as captured contained both the pre- and post-change lines of a
    diff (duplicate thresholds 1000/100 and p999/p99); this is the
    reconstructed post-change implementation.

    NOTE(review): this reads prometheus_client private attributes
    (``_buckets``, ``_value``, ``_upper_bounds``) — confirm they exist in
    the pinned client version.

    Args:
        value: The TTFT measurement (same units as the histogram buckets).

    Returns:
        True when ``value`` is at or above the upper bound of the bucket
        where the cumulative count first exceeds the p99 threshold;
        False when there are too few observations to judge.
    """
    his = self.histogram_time_to_first_token.labels(**self.labels)
    total_observations = sum(bucket._value for bucket in his._buckets)
    # Too few samples to estimate a tail percentile reliably.
    if total_observations < 100:
        return False
    p99_threshold = total_observations * 0.99
    cumulative_count = 0
    for i, bucket in enumerate(his._buckets):
        cumulative_count += bucket._value
        if cumulative_count > p99_threshold:
            # First bucket whose cumulative count crosses p99: a straggler
            # is any observation at or beyond this bucket's upper bound.
            return value >= his._upper_bounds[i]
    return False
......
......@@ -135,6 +135,8 @@ GRAMMAR_BACKEND_CHOICES = ["xgrammar", "outlines", "llguidance", "none"]
DETERMINISTIC_ATTENTION_BACKEND_CHOICES = ["flashinfer", "fa3", "triton"]
RADIX_SUPPORTED_DETERMINISTIC_ATTENTION_BACKEND = ["fa3", "triton"]
DEFAULT_LORA_EVICTION_POLICY = "lru"
NSA_CHOICES = [
......@@ -190,6 +192,10 @@ def add_deterministic_attention_backend_choices(choices):
DETERMINISTIC_ATTENTION_BACKEND_CHOICES.extend(choices)
def add_radix_supported_deterministic_attention_backend_choices(choices):
    """Append *choices* to the list of deterministic-attention backends
    that are compatible with the radix cache."""
    for backend in choices:
        RADIX_SUPPORTED_DETERMINISTIC_ATTENTION_BACKEND.append(backend)
def add_radix_eviction_policy_choices(choices):
    """Append *choices* to the module-level list of radix-cache eviction
    policy names."""
    for policy in choices:
        RADIX_EVICTION_POLICY_CHOICES.append(policy)
......@@ -1753,13 +1759,17 @@ class ServerArgs:
f"but you explicitly specified '{self.attention_backend}'."
)
if self.attention_backend not in ["fa3", "triton"]:
if is_deepseek_model:
if is_deepseek_model:
if self.attention_backend not in ["fa3", "triton"]:
raise ValueError(
f"Currently only fa3 and triton attention backends are supported for deterministic inference with DeepSeek models. But you're using {self.attention_backend}."
f"Currently only {RADIX_SUPPORTED_DETERMINISTIC_ATTENTION_BACKEND} attention backends are supported for deterministic inference with DeepSeek models. But you're using {self.attention_backend}."
)
# Currently, only FA3 and Triton supports radix cache. Support for other backends is in progress
if (
self.attention_backend
not in RADIX_SUPPORTED_DETERMINISTIC_ATTENTION_BACKEND
):
# Currently, only certain backends support radix cache. Support for other backends is in progress
self.disable_radix_cache = True
logger.warning(
f"Currently radix cache is not compatible with {self.attention_backend} attention backend for deterministic inference. It will be supported in the future."
......
......@@ -21,7 +21,7 @@ python3 -c 'import os, shutil, tempfile, getpass; cache_dir = os.environ.get("TO
rm -rf /root/.cache/flashinfer
# Install apt packages
apt install -y git libnuma-dev
apt install -y git libnuma-dev libssl-dev pkg-config
# Install protoc for router build (gRPC protobuf compilation)
if ! command -v protoc &> /dev/null; then
......@@ -54,10 +54,7 @@ if [ "$IS_BLACKWELL" = "1" ]; then
PIP_INSTALL_SUFFIX="--break-system-packages"
# Clean up existing installations
$PIP_CMD uninstall -y flashinfer_python sgl-kernel sglang vllm $PIP_INSTALL_SUFFIX || true
# Install the main package
$PIP_CMD install -e "python[dev]" --extra-index-url https://download.pytorch.org/whl/${CU_VERSION} $PIP_INSTALL_SUFFIX --force-reinstall
$PIP_CMD uninstall -y sgl-kernel sglang $PIP_INSTALL_SUFFIX || true
else
# In normal cases, we use uv, which is much faster than pip.
pip install --upgrade pip
......@@ -68,20 +65,11 @@ else
PIP_INSTALL_SUFFIX="--index-strategy unsafe-best-match"
# Clean up existing installations
$PIP_CMD uninstall flashinfer_python sgl-kernel sglang vllm || true
# Install the main package without deps
$PIP_CMD install -e "python[dev]" --no-deps $PIP_INSTALL_SUFFIX --force-reinstall
# Install flashinfer-python 0.4.1 dependency that requires prerelease (This should be removed when flashinfer fixes this issue)
$PIP_CMD install flashinfer-python==0.4.1 --prerelease=allow $PIP_INSTALL_SUFFIX
# Install the main package
$PIP_CMD install -e "python[dev]" --extra-index-url https://download.pytorch.org/whl/${CU_VERSION} $PIP_INSTALL_SUFFIX --upgrade
$PIP_CMD uninstall sgl-kernel sglang || true
fi
# Install OpenSSL development libraries for router build
apt install -y libssl-dev pkg-config
# Install the main package
$PIP_CMD install -e "python[dev]" --extra-index-url https://download.pytorch.org/whl/${CU_VERSION} $PIP_INSTALL_SUFFIX
# Install router for pd-disagg test
$PIP_CMD install -e "sgl-router" $PIP_INSTALL_SUFFIX
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment