Unverified commit 6b143d62, authored Oct 15, 2025 by Stefan He, committed by GitHub on Oct 15, 2025

Clean up some Qwen3-Next and deterministic code (#11585)
Parent: 6bc503af
Showing 4 changed files, with 1 addition and 15 deletions:
python/sglang/srt/configs/mamba_utils.py    +1 -1
python/sglang/srt/configs/qwen3_next.py     +0 -3
python/sglang/srt/models/falcon_h1.py       +0 -7
python/sglang/test/test_deterministic.py    +0 -4
python/sglang/srt/configs/mamba_utils.py

@@ -70,7 +70,7 @@ class Mamba2StateShape:
        # These are not TP-ed as they depend on A, dt_bias, D
        # - they are typically small
        # e.g., (h_heads, head_dim, state_size) = (128, 64, 128)
        # e.g., QWen3-Next: (32, 128, 128)
        temporal_state_shape = (
            divide(num_heads, tp_world_size),
            head_dim,
            state_size,
        )
        return Mamba2StateShape(
            conv=conv_state_shape,
...
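The hunk above touches the comment documenting how the temporal (SSM) state shape is derived: only the head axis is sharded across tensor-parallel ranks, while head_dim and state_size stay unsharded because they depend on A, dt_bias, and D. A minimal sketch of that arithmetic, using the Qwen3-Next example shape from the comment; the divide helper below is an illustrative stand-in for the project's divisibility-checked division, not its actual implementation:

# Sketch of the shape arithmetic documented in the comment above.
# `divide` stands in for a TP utility that checks even divisibility.
def divide(numerator: int, denominator: int) -> int:
    assert numerator % denominator == 0, "num_heads must divide evenly across TP ranks"
    return numerator // denominator

# Example from the comment: QWen3-Next uses (num_heads, head_dim, state_size) = (32, 128, 128).
num_heads, head_dim, state_size = 32, 128, 128
tp_world_size = 4  # hypothetical tensor-parallel world size

temporal_state_shape = (divide(num_heads, tp_world_size), head_dim, state_size)
print(temporal_state_shape)  # (8, 128, 128): only the head axis is split across ranks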
python/sglang/srt/configs/qwen3_next.py

@@ -27,12 +27,9 @@ from sglang.srt.layers.dp_attention import get_attention_tp_size
logger = logging.get_logger(__name__)


# NOTE: HybridLayerType
class HybridLayerType(enum.Enum):
    full_attention = "attention"
    swa_attention = "swa_attention"
    linear_attention = "linear_attention"
    mamba2 = "mamba"


class Qwen3NextConfig(PretrainedConfig):
...
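For readers skimming the diff, HybridLayerType simply maps descriptive member names to the layer-type strings used in hybrid-model configs. A small, hypothetical usage sketch; the layers_block_type list below is illustrative, not a field guaranteed by Qwen3NextConfig:

import enum

class HybridLayerType(enum.Enum):
    full_attention = "attention"
    swa_attention = "swa_attention"
    linear_attention = "linear_attention"
    mamba2 = "mamba"

# Hypothetical per-layer type strings, as a hybrid config might list them.
layers_block_type = ["linear_attention", "linear_attention", "attention", "linear_attention"]

# Enum lookup by value turns each string into a HybridLayerType member.
layer_types = [HybridLayerType(t) for t in layers_block_type]
print(layer_types[2])  # HybridLayerType.full_attention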
python/sglang/srt/models/falcon_h1.py

@@ -450,13 +450,6 @@ class FalconH1Model(nn.Module):
        return hidden_states


class HybridLayerType(enum.Enum):
    full_attention = "attention"
    swa_attention = "swa_attention"
    linear_attention = "linear_attention"
    mamba2 = "mamba"


class FalconH1ForCausalLM(nn.Module):
    fall_back_to_pt_during_load = False
...
python/sglang/test/test_deterministic.py

@@ -226,10 +226,6 @@ def send_prefix(args, batch_size: int, prompts: List[str]):
def test_deterministic(args):
    # First do some warmups
    for i in range(3):
        send_single(args, 16, args.profile)

    if args.test_mode == "single":
        # In single mode, we test the deterministic behavior by sending the same prompt in batch sizes ranging from 1 to n_trials.
        texts = []
...
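The comment in this hunk describes the single-mode check: the same prompt is sent at batch sizes ranging from 1 to n_trials and the returned texts are compared for exact equality. A rough sketch of that idea; generate below is a hypothetical stand-in for the script's send_single helper, not its real signature:

# Rough sketch of the single-mode determinism check described above.
# `generate(batch_size)` is a hypothetical callable that sends the same
# prompt at the given batch size and returns the first completion's text.
def check_single_mode_determinism(generate, n_trials: int) -> bool:
    texts = []
    for batch_size in range(1, n_trials + 1):
        texts.append(generate(batch_size))
    # Deterministic inference should produce identical text at every batch size.
    return len(set(texts)) == 1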