[misc] dependencies & environment flag (#12113)

8491c794 · Liangsheng Yin · GitHub · bda3758f · 8491c794 · 8491c794
Unverified Commit 8491c794 authored Oct 26, 2025 by Liangsheng Yin Committed by GitHub Oct 26, 2025
4 changed files
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -81,7 +81,6 @@ modelopt = ["nvidia-modelopt"]
 test = [
  "accelerate",
  "expecttest",
-  "gguf",
  "jsonlines",
  "matplotlib",
  "pandas",

--- a/python/sglang/srt/environ.py
+++ b/python/sglang/srt/environ.py
@@ -231,6 +231,7 @@ class Envs:
    SGLANG_TRITON_DECODE_SPLIT_TILE_SIZE = EnvInt(256)
    # Overlap Spec V2
+    SGLANG_ENABLE_SPEC_V2 = EnvBool(False)
    SGLANG_ENABLE_OVERLAP_PLAN_STREAM = EnvBool(False)
    # VLM

--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -27,6 +27,7 @@ from typing import Dict, List, Literal, Optional, Union
 import orjson
 from sglang.srt.connector import ConnectorType
+from sglang.srt.environ import envs
 from sglang.srt.function_call.function_call_parser import FunctionCallParser
 from sglang.srt.lora.lora_registry import LoRARef
 from sglang.srt.parser.reasoning_parser import ReasoningParser
@@ -342,7 +343,6 @@ class ServerArgs:
    nsa_decode_backend: str = "fa3"
    # Speculative decoding
-    enable_beta_spec: bool = False
    speculative_algorithm: Optional[str] = None
    speculative_draft_model_path: Optional[str] = None
    speculative_draft_model_revision: Optional[str] = None
@@ -1431,13 +1431,16 @@ class ServerArgs:
                    "Max running requests is reset to 48 for speculative decoding. You can override this by explicitly setting --max-running-requests."
                )
-            if self.speculative_algorithm == "EAGLE" and self.enable_beta_spec:
+            if (
+                self.speculative_algorithm == "EAGLE"
+                and envs.SGLANG_ENABLE_SPEC_V2.get()
+            ):
                self.disable_overlap_schedule = False
                logger.warning(
                    "Beta spec is enabled for eagle speculative decoding and overlap schedule is turned on."
                )
-            if not self.enable_beta_spec:
+            if not envs.SGLANG_ENABLE_SPEC_V2.get():
                self.disable_overlap_schedule = True
                logger.warning(
                    "Overlap scheduler is disabled because of using eagle3 or standalone speculative decoding."
@@ -2573,7 +2576,6 @@ class ServerArgs:
        )
        # Speculative decoding
-        parser.add_argument("--enable-beta-spec", action="store_true")
        parser.add_argument(
            "--speculative-algorithm",
            type=str,

--- a/test/srt/test_eagle_infer_beta.py
+++ b/test/srt/test_eagle_infer_beta.py
 import unittest
 from types import SimpleNamespace
+from sglang.srt.environ import envs
 from sglang.srt.utils import kill_process_tree
 from sglang.test.few_shot_gsm8k import run_eval
 from sglang.test.kit_matched_stop import MatchedStopMixin
@@ -29,7 +30,6 @@ class TestEagleServerBase(CustomTestCase, MatchedStopMixin):
    def setUpClass(cls):
        cls.base_url = DEFAULT_URL_FOR_TEST
        launch_args = [
-            "--enable-beta-spec",
            "--trust-remote-code",
            "--attention-backend",
            cls.attention_backend,
@@ -53,12 +53,13 @@ class TestEagleServerBase(CustomTestCase, MatchedStopMixin):
            *[str(i) for i in range(1, cls.max_running_requests + 1)],
        ]
        launch_args.extend(cls.other_launch_args)
-        cls.process = popen_launch_server(
-            cls.model,
-            cls.base_url,
-            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-            other_args=launch_args,
-        )
+        with envs.SGLANG_ENABLE_SPEC_V2.override(True):
+            cls.process = popen_launch_server(
+                cls.model,
+                cls.base_url,
+                timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+                other_args=launch_args,
+            )
    @classmethod
    def tearDownClass(cls):