Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
c16b33cc
Unverified
Commit
c16b33cc
authored
Mar 18, 2025
by
Yineng Zhang
Committed by
GitHub
Mar 18, 2025
Browse files
cleanup deps 3/n (#4541)
parent
2d004512
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
16 additions
and
12 deletions
+16
-12
python/sglang/srt/layers/quantization/fp8.py
python/sglang/srt/layers/quantization/fp8.py
+0
-4
python/sglang/srt/models/deepseek_nextn.py
python/sglang/srt/models/deepseek_nextn.py
+2
-1
python/sglang/srt/models/deepseek_v2.py
python/sglang/srt/models/deepseek_v2.py
+2
-1
python/sglang/srt/utils.py
python/sglang/srt/utils.py
+12
-6
No files found.
python/sglang/srt/layers/quantization/fp8.py
View file @
c16b33cc
...
@@ -152,8 +152,6 @@ class Fp8Config(QuantizationConfig):
...
@@ -152,8 +152,6 @@ class Fp8Config(QuantizationConfig):
def
get_quant_method
(
def
get_quant_method
(
self
,
layer
:
torch
.
nn
.
Module
,
prefix
:
str
self
,
layer
:
torch
.
nn
.
Module
,
prefix
:
str
)
->
Optional
[
"QuantizeMethodBase"
]:
)
->
Optional
[
"QuantizeMethodBase"
]:
from
vllm.attention.layer
import
Attention
# Avoid circular import
from
sglang.srt.layers.moe.fused_moe_triton
import
FusedMoE
from
sglang.srt.layers.moe.fused_moe_triton
import
FusedMoE
if
isinstance
(
layer
,
LinearBase
):
if
isinstance
(
layer
,
LinearBase
):
...
@@ -162,8 +160,6 @@ class Fp8Config(QuantizationConfig):
...
@@ -162,8 +160,6 @@ class Fp8Config(QuantizationConfig):
return
Fp8LinearMethod
(
self
)
return
Fp8LinearMethod
(
self
)
elif
isinstance
(
layer
,
FusedMoE
):
elif
isinstance
(
layer
,
FusedMoE
):
return
Fp8MoEMethod
(
self
)
return
Fp8MoEMethod
(
self
)
elif
isinstance
(
layer
,
Attention
):
return
Fp8KVCacheMethod
(
self
)
return
None
return
None
def
get_scaled_act_names
(
self
)
->
List
[
str
]:
def
get_scaled_act_names
(
self
)
->
List
[
str
]:
...
...
python/sglang/srt/models/deepseek_nextn.py
View file @
c16b33cc
...
@@ -18,7 +18,6 @@ from typing import Iterable, Optional, Tuple
...
@@ -18,7 +18,6 @@ from typing import Iterable, Optional, Tuple
import
torch
import
torch
from
torch
import
nn
from
torch
import
nn
from
transformers
import
PretrainedConfig
from
transformers
import
PretrainedConfig
from
vllm
import
_custom_ops
as
ops
from
sglang.srt.layers.layernorm
import
RMSNorm
from
sglang.srt.layers.layernorm
import
RMSNorm
from
sglang.srt.layers.linear
import
ReplicatedLinear
from
sglang.srt.layers.linear
import
ReplicatedLinear
...
@@ -48,6 +47,8 @@ _is_cuda = is_cuda()
...
@@ -48,6 +47,8 @@ _is_cuda = is_cuda()
if
_is_cuda
:
if
_is_cuda
:
from
sgl_kernel
import
awq_dequantize
from
sgl_kernel
import
awq_dequantize
else
:
from
vllm
import
_custom_ops
as
ops
class
DeepseekModelNextN
(
nn
.
Module
):
class
DeepseekModelNextN
(
nn
.
Module
):
...
...
python/sglang/srt/models/deepseek_v2.py
View file @
c16b33cc
...
@@ -23,7 +23,6 @@ import torch
...
@@ -23,7 +23,6 @@ import torch
import
torch.nn.functional
as
F
import
torch.nn.functional
as
F
from
torch
import
nn
from
torch
import
nn
from
transformers
import
PretrainedConfig
from
transformers
import
PretrainedConfig
from
vllm
import
_custom_ops
as
ops
from
sglang.srt.distributed
import
(
from
sglang.srt.distributed
import
(
get_tensor_model_parallel_world_size
,
get_tensor_model_parallel_world_size
,
...
@@ -75,6 +74,8 @@ _is_cuda = is_cuda()
...
@@ -75,6 +74,8 @@ _is_cuda = is_cuda()
if
_is_cuda
:
if
_is_cuda
:
from
sgl_kernel
import
awq_dequantize
,
bmm_fp8
from
sgl_kernel
import
awq_dequantize
,
bmm_fp8
else
:
from
vllm
import
_custom_ops
as
ops
class
DeepseekV2MLP
(
nn
.
Module
):
class
DeepseekV2MLP
(
nn
.
Module
):
...
...
python/sglang/srt/utils.py
View file @
c16b33cc
...
@@ -531,7 +531,10 @@ def load_image(image_file: Union[str, bytes]):
...
@@ -531,7 +531,10 @@ def load_image(image_file: Union[str, bytes]):
def
suppress_other_loggers
():
def
suppress_other_loggers
():
from
vllm.logger
import
logger
as
vllm_default_logger
try
:
from
vllm.logger
import
logger
as
vllm_default_logger
except
ImportError
:
return
vllm_default_logger
.
setLevel
(
logging
.
WARN
)
vllm_default_logger
.
setLevel
(
logging
.
WARN
)
logging
.
getLogger
(
"vllm.distributed.device_communicators.pynccl"
).
setLevel
(
logging
.
getLogger
(
"vllm.distributed.device_communicators.pynccl"
).
setLevel
(
...
@@ -620,11 +623,14 @@ def monkey_patch_p2p_access_check():
...
@@ -620,11 +623,14 @@ def monkey_patch_p2p_access_check():
def
monkey_patch_vllm_gguf_config
():
def
monkey_patch_vllm_gguf_config
():
from
vllm.model_executor.layers.quantization.gguf
import
(
try
:
GGUFConfig
,
from
vllm.model_executor.layers.quantization.gguf
import
(
GGUFEmbeddingMethod
,
GGUFConfig
,
GGUFLinearMethod
,
GGUFEmbeddingMethod
,
)
GGUFLinearMethod
,
)
except
ImportError
:
return
from
sglang.srt.layers.linear
import
LinearBase
from
sglang.srt.layers.linear
import
LinearBase
from
sglang.srt.layers.vocab_parallel_embedding
import
VocabParallelEmbedding
from
sglang.srt.layers.vocab_parallel_embedding
import
VocabParallelEmbedding
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment