sglang / commit 9a44b643 (unverified)
Authored by Lianmin Zheng on Aug 09, 2025; committed via GitHub on Aug 09, 2025.
Fix CI (#9012)
Parent: 41d71ca4
Showing 9 changed files with 24 additions and 20 deletions (+24, -20):
.github/workflows/vllm-dependency-test.yml               +9  -3
python/sglang/srt/entrypoints/engine.py                  +2  -2
python/sglang/srt/entrypoints/openai/tool_server.py      +4  -3
python/sglang/srt/layers/moe/fused_moe_triton/layer.py   +1  -0
python/sglang/srt/layers/quantization/__init__.py        +4  -2
python/sglang/srt/layers/quantization/modelopt_quant.py  +2  -7
python/sglang/srt/managers/multimodal_processor.py       +1  -1
python/sglang/srt/models/registry.py                     +1  -1
test/srt/test_utils_update_weights.py                    +0  -1
.github/workflows/vllm-dependency-test.yml

```diff
@@ -30,13 +30,19 @@ jobs:
       - name: Install dependencies
         run: |
           bash scripts/ci_install_dependency.sh
-          pip install "vllm==0.9.0"
-          pip install "bitsandbytes>=0.44.0"
+          pip install "vllm==0.10.0"
+          pip install torch==2.8.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/test/cu126
           pip install "openai==1.99.1"
+          pip install "bitsandbytes>=0.44.0"
+
+          # NOTE: The latest sgl-kernel depends on torch 2.8.0 but the latest vllm depends on torch 2.7.0
+          # so they are not compatible. Here we install the old sgl-kernel to make the test pass.
+          # TODO: remove this once vllm supports torch 2.8.0.
+          pip install "sgl-kernel==0.2.9"

       - name: Run vLLM dependency tests
         timeout-minutes: 60
         run: |
+          export SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK=1
           cd test/srt
           python3 run_suite.py --suite vllm_dependency_test --timeout-per-file 3600
```
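The exported SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK=1 pairs with the get_bool_env_var guard added to engine.py below: it lets CI run the deliberately pinned sgl-kernel==0.2.9 without tripping the >=0.3.3 version assertion.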
python/sglang/srt/entrypoints/engine.py

```diff
@@ -67,6 +67,7 @@ from sglang.srt.utils import (
     MultiprocessingSerializer,
     assert_pkg_version,
     configure_logger,
+    get_bool_env_var,
     get_zmq_socket,
     is_cuda,
     kill_process_tree,
@@ -627,7 +628,6 @@ def _set_envs_and_config(server_args: ServerArgs):
     os.environ["NCCL_CUMEM_ENABLE"] = str(int(server_args.enable_symm_mem))
     if not server_args.enable_symm_mem:
         os.environ["NCCL_NVLS_ENABLE"] = str(int(server_args.enable_nccl_nvls))
-    os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
     os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "4"
     os.environ["CUDA_MODULE_LOADING"] = "AUTO"
@@ -647,7 +647,7 @@ def _set_envs_and_config(server_args: ServerArgs):
             "reinstall the latest version by following the instructions "
             "at https://docs.flashinfer.ai/installation.html.",
         )
-    if _is_cuda:
+    if _is_cuda and not get_bool_env_var("SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK"):
         assert_pkg_version(
             "sgl-kernel",
             "0.3.3",
```
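For context, here is a minimal self-contained sketch of the escape hatch this change introduces. The helper name get_bool_env_var comes from the diff, but its exact semantics and the require_sgl_kernel_version wrapper are assumptions for illustration; sglang's real check goes through assert_pkg_version.

```python
# Minimal sketch of the version-check escape hatch (assumed semantics; the
# real implementation uses assert_pkg_version from sglang.srt.utils).
import os
from importlib.metadata import PackageNotFoundError, version


def get_bool_env_var(name: str, default: str = "false") -> bool:
    # Assumption: "1"/"true"/"yes" (case-insensitive) count as truthy.
    return os.getenv(name, default).lower() in ("1", "true", "yes")


def require_sgl_kernel_version(min_version: str = "0.3.3") -> None:
    # Hypothetical wrapper mirroring the guarded call in the diff: exporting
    # SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK=1 (as the CI workflow now does)
    # bypasses the check so the older sgl-kernel==0.2.9 can be used.
    if get_bool_env_var("SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK"):
        return
    try:
        installed = version("sgl-kernel")
    except PackageNotFoundError:
        raise RuntimeError("sgl-kernel is not installed")
    # Compare as integer tuples to avoid lexicographic pitfalls ("0.10" < "0.3").
    def as_tuple(v: str) -> tuple:
        return tuple(int(x) for x in v.split(".")[:3])
    if as_tuple(installed) < as_tuple(min_version):
        raise RuntimeError(f"sgl-kernel {installed} < required {min_version}")
```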
python/sglang/srt/entrypoints/openai/tool_server.py

```diff
@@ -5,16 +5,17 @@ from abc import ABC, abstractmethod
 from contextlib import AbstractAsyncContextManager, asynccontextmanager
 from typing import Any

+logger = logging.getLogger(__name__)
 try:
     from mcp import ClientSession
     from mcp.client.sse import sse_client
     from mcp.types import ListToolsResult
-except ImportError:
+except ImportError as e:
+    logger.warning("Ignoring mcp import error")
+    ClientSession = sse_client = ListToolsResult = e

 from openai_harmony import ToolDescription, ToolNamespaceConfig

-logger = logging.getLogger(__name__)
-

 async def list_server_and_tools(server_url: str):
```
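The except-branch uses a deferred-failure idiom worth noting. A minimal standalone sketch (the mcp package is the optional dependency here; everything else is illustrative):

```python
# Deferred-failure import: bind the missing names to the ImportError itself.
# Importing this module always succeeds; only code that actually *uses*
# ClientSession later fails, and the bound exception keeps the root cause
# inspectable instead of silently replacing the name with None.
import logging

logger = logging.getLogger(__name__)

try:
    from mcp import ClientSession  # optional dependency
except ImportError as e:
    logger.warning("Ignoring mcp import error")
    ClientSession = e  # calling ClientSession(...) later raises a TypeError
```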
python/sglang/srt/layers/moe/fused_moe_triton/layer.py

```diff
@@ -147,6 +147,7 @@ class FusedMoE(torch.nn.Module):
         self.layer_id = layer_id
         self.top_k = top_k
+        self.hidden_size = hidden_size
         self.num_experts = num_experts
         self.num_fused_shared_experts = num_fused_shared_experts
         self.expert_map_cpu = None
```
python/sglang/srt/layers/quantization/__init__.py

```diff
@@ -26,8 +26,9 @@ try:
     from vllm.model_executor.layers.quantization.tpu_int8 import Int8TpuConfig

     VLLM_AVAILABLE = True
-except ImportError:
+except ImportError as e:
     VLLM_AVAILABLE = False
+    VLLM_IMPORT_ERROR = e

     # Define empty classes as placeholders when vllm is not available
     class DummyConfig:
@@ -137,7 +138,8 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
     if quantization in VLLM_QUANTIZATION_METHODS and not VLLM_AVAILABLE:
         raise ValueError(
             f"{quantization} quantization requires some operators from vllm. "
-            "Please install vllm by `pip install vllm==0.9.0.1`"
+            f"Please install vllm by `pip install vllm==0.9.0.1`\n"
+            f"Import error: {VLLM_IMPORT_ERROR}"
         )

     return QUANTIZATION_METHODS[quantization]
```
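The same record-the-root-cause idea as a standalone sketch; require_vllm is a hypothetical helper (the real check lives in get_quantization_config):

```python
# Remember *why* the optional dependency failed so the user-facing error can
# include the root cause (a missing shared library, an ABI mismatch, ...)
# instead of a bare "vllm is not available".
VLLM_AVAILABLE = True
VLLM_IMPORT_ERROR = None
try:
    import vllm  # noqa: F401  (optional dependency)
except ImportError as e:
    VLLM_AVAILABLE = False
    VLLM_IMPORT_ERROR = e


def require_vllm(quantization: str) -> None:
    # Hypothetical helper mirroring the diff's error message.
    if not VLLM_AVAILABLE:
        raise ValueError(
            f"{quantization} quantization requires some operators from vllm. "
            f"Please install vllm by `pip install vllm==0.9.0.1`\n"
            f"Import error: {VLLM_IMPORT_ERROR}"
        )
```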
python/sglang/srt/layers/quantization/modelopt_quant.py

```diff
 # Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/modelopt.py
 from __future__ import annotations

-import importlib.util
 import logging
-from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Any, Dict, List, Optional

 import torch
 from torch.nn.parameter import Parameter
@@ -42,11 +41,7 @@ if is_cuda():
 try:
     from flashinfer import mm_fp4 as fp4_gemm
-    from flashinfer import (
-        reorder_rows_for_gated_act_gemm,
-        shuffle_matrix_a,
-        shuffle_matrix_sf_a,
-    )
+    from flashinfer import reorder_rows_for_gated_act_gemm, shuffle_matrix_sf_a

     enable_flashinfer_fp4_gemm = True
 except ImportError:
```
python/sglang/srt/managers/multimodal_processor.py

```diff
@@ -20,7 +20,7 @@ def import_processors():
         try:
             module = importlib.import_module(name)
         except Exception as e:
-            logger.warning(f"Ignore import error when loading {name}: " f"{e}")
+            logger.warning(f"Ignore import error when loading {name}: {e}")
             continue
         all_members = inspect.getmembers(module, inspect.isclass)
         classes = [
```
python/sglang/srt/models/registry.py

```diff
@@ -83,7 +83,7 @@ def import_model_classes():
         try:
             module = importlib.import_module(name)
         except Exception as e:
-            logger.warning(f"Ignore import error when loading {name}. " f"{e}")
+            logger.warning(f"Ignore import error when loading {name}: {e}")
             continue
         if hasattr(module, "EntryClass"):
             entry = module.EntryClass
```
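Both logging changes (here and in multimodal_processor.py) lean on Python's compile-time concatenation of adjacent string literals, so the collapsed message is behaviorally identical; a quick check:

```python
# Adjacent f-strings concatenate at compile time, so merging them into one
# f-string changes readability, not output (registry.py also fixes the "."
# separator to ":").
name, e = "demo_module", ImportError("No module named 'demo'")
assert (
    f"Ignore import error when loading {name}: " f"{e}"
    == f"Ignore import error when loading {name}: {e}"
)
```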
test/srt/test_utils_update_weights.py

```diff
@@ -83,7 +83,6 @@ class TestUtilsUpdateWeights(unittest.TestCase):
         # Set up environment variables
         os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
         os.environ["NCCL_CUMEM_ENABLE"] = "0"
-        os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
         os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "4"
         os.environ["CUDA_MODULE_LOADING"] = "AUTO"
```
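This removal mirrors the engine.py change above: with TORCH_NCCL_AVOID_RECORD_STREAMS no longer set by _set_envs_and_config, the test stops setting it as well.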