Unverified Commit 8491c794 authored by Liangsheng Yin's avatar Liangsheng Yin Committed by GitHub
Browse files

[misc] dependencies & environment flag (#12113)

parent bda3758f
...@@ -81,7 +81,6 @@ modelopt = ["nvidia-modelopt"] ...@@ -81,7 +81,6 @@ modelopt = ["nvidia-modelopt"]
test = [ test = [
"accelerate", "accelerate",
"expecttest", "expecttest",
"gguf",
"jsonlines", "jsonlines",
"matplotlib", "matplotlib",
"pandas", "pandas",
......
...@@ -231,6 +231,7 @@ class Envs: ...@@ -231,6 +231,7 @@ class Envs:
SGLANG_TRITON_DECODE_SPLIT_TILE_SIZE = EnvInt(256) SGLANG_TRITON_DECODE_SPLIT_TILE_SIZE = EnvInt(256)
# Overlap Spec V2 # Overlap Spec V2
SGLANG_ENABLE_SPEC_V2 = EnvBool(False)
SGLANG_ENABLE_OVERLAP_PLAN_STREAM = EnvBool(False) SGLANG_ENABLE_OVERLAP_PLAN_STREAM = EnvBool(False)
# VLM # VLM
......
...@@ -27,6 +27,7 @@ from typing import Dict, List, Literal, Optional, Union ...@@ -27,6 +27,7 @@ from typing import Dict, List, Literal, Optional, Union
import orjson import orjson
from sglang.srt.connector import ConnectorType from sglang.srt.connector import ConnectorType
from sglang.srt.environ import envs
from sglang.srt.function_call.function_call_parser import FunctionCallParser from sglang.srt.function_call.function_call_parser import FunctionCallParser
from sglang.srt.lora.lora_registry import LoRARef from sglang.srt.lora.lora_registry import LoRARef
from sglang.srt.parser.reasoning_parser import ReasoningParser from sglang.srt.parser.reasoning_parser import ReasoningParser
...@@ -342,7 +343,6 @@ class ServerArgs: ...@@ -342,7 +343,6 @@ class ServerArgs:
nsa_decode_backend: str = "fa3" nsa_decode_backend: str = "fa3"
# Speculative decoding # Speculative decoding
enable_beta_spec: bool = False
speculative_algorithm: Optional[str] = None speculative_algorithm: Optional[str] = None
speculative_draft_model_path: Optional[str] = None speculative_draft_model_path: Optional[str] = None
speculative_draft_model_revision: Optional[str] = None speculative_draft_model_revision: Optional[str] = None
...@@ -1431,13 +1431,16 @@ class ServerArgs: ...@@ -1431,13 +1431,16 @@ class ServerArgs:
"Max running requests is reset to 48 for speculative decoding. You can override this by explicitly setting --max-running-requests." "Max running requests is reset to 48 for speculative decoding. You can override this by explicitly setting --max-running-requests."
) )
if self.speculative_algorithm == "EAGLE" and self.enable_beta_spec: if (
self.speculative_algorithm == "EAGLE"
and envs.SGLANG_ENABLE_SPEC_V2.get()
):
self.disable_overlap_schedule = False self.disable_overlap_schedule = False
logger.warning( logger.warning(
"Beta spec is enabled for eagle speculative decoding and overlap schedule is turned on." "Beta spec is enabled for eagle speculative decoding and overlap schedule is turned on."
) )
if not self.enable_beta_spec: if not envs.SGLANG_ENABLE_SPEC_V2.get():
self.disable_overlap_schedule = True self.disable_overlap_schedule = True
logger.warning( logger.warning(
"Overlap scheduler is disabled because of using eagle3 or standalone speculative decoding." "Overlap scheduler is disabled because of using eagle3 or standalone speculative decoding."
...@@ -2573,7 +2576,6 @@ class ServerArgs: ...@@ -2573,7 +2576,6 @@ class ServerArgs:
) )
# Speculative decoding # Speculative decoding
parser.add_argument("--enable-beta-spec", action="store_true")
parser.add_argument( parser.add_argument(
"--speculative-algorithm", "--speculative-algorithm",
type=str, type=str,
......
import unittest import unittest
from types import SimpleNamespace from types import SimpleNamespace
from sglang.srt.environ import envs
from sglang.srt.utils import kill_process_tree from sglang.srt.utils import kill_process_tree
from sglang.test.few_shot_gsm8k import run_eval from sglang.test.few_shot_gsm8k import run_eval
from sglang.test.kit_matched_stop import MatchedStopMixin from sglang.test.kit_matched_stop import MatchedStopMixin
...@@ -29,7 +30,6 @@ class TestEagleServerBase(CustomTestCase, MatchedStopMixin): ...@@ -29,7 +30,6 @@ class TestEagleServerBase(CustomTestCase, MatchedStopMixin):
def setUpClass(cls): def setUpClass(cls):
cls.base_url = DEFAULT_URL_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST
launch_args = [ launch_args = [
"--enable-beta-spec",
"--trust-remote-code", "--trust-remote-code",
"--attention-backend", "--attention-backend",
cls.attention_backend, cls.attention_backend,
...@@ -53,12 +53,13 @@ class TestEagleServerBase(CustomTestCase, MatchedStopMixin): ...@@ -53,12 +53,13 @@ class TestEagleServerBase(CustomTestCase, MatchedStopMixin):
*[str(i) for i in range(1, cls.max_running_requests + 1)], *[str(i) for i in range(1, cls.max_running_requests + 1)],
] ]
launch_args.extend(cls.other_launch_args) launch_args.extend(cls.other_launch_args)
cls.process = popen_launch_server( with envs.SGLANG_ENABLE_SPEC_V2.override(True):
cls.model, cls.process = popen_launch_server(
cls.base_url, cls.model,
timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, cls.base_url,
other_args=launch_args, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
) other_args=launch_args,
)
@classmethod @classmethod
def tearDownClass(cls): def tearDownClass(cls):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment