sglang · Commit 177320a5 (Unverified)

Clean up imports (#5467)

Authored Apr 16, 2025 by Lianmin Zheng; committed by GitHub on Apr 16, 2025
Parent: d7bc19a4
Showing 11 changed files with 24 additions and 49 deletions (+24 / -49)
python/sglang/srt/models/deepseek_nextn.py          +2 / -2
python/sglang/srt/models/deepseek_v2.py             +5 / -6
python/sglang/srt/reasoning_parser.py               +0 / -1
python/sglang/srt/sampling/sampling_batch_info.py   +2 / -3
python/sglang/srt/server.py                         +0 / -18
python/sglang/srt/server_args.py                    +6 / -10
python/sglang/srt/utils.py                          +3 / -1
python/sglang/test/runners.py                       +1 / -1
python/sglang/test/test_custom_ops.py               +1 / -1
test/srt/test_fp8_kernel.py                         +1 / -3
test/srt/test_triton_moe_channel_fp8_kernel.py      +3 / -3
python/sglang/srt/models/deepseek_nextn.py (+2 / -2)

@@ -48,7 +48,7 @@ _is_cuda = is_cuda()
 if _is_cuda:
     from sgl_kernel import awq_dequantize
 else:
-    from vllm import _custom_ops as ops
+    from vllm._custom_ops import awq_dequantize


 class DeepseekModelNextN(nn.Module):
@@ -273,7 +273,7 @@ class DeepseekV3ForCausalLMNextN(DeepseekV3ForCausalLM):
                         self_attn.kv_b_proj.qzeros,
                     ).T
                 else:
-                    w = ops.awq_dequantize(
+                    w = awq_dequantize(
                         self_attn.kv_b_proj.qweight,
                         self_attn.kv_b_proj.scales,
                         self_attn.kv_b_proj.qzeros,
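Note: the point of the else-branch change is that both branches now bind the same name, so call sites drop the ops. prefix (see the second hunk). A runnable approximation of the pattern, gating on torch.cuda.is_available() instead of sglang's is_cuda() helper:

import torch

# Either branch binds awq_dequantize to the same name; the fallback assumes
# vLLM is installed on non-CUDA builds, as in the hunk above.
if torch.cuda.is_available():
    from sgl_kernel import awq_dequantize
else:
    from vllm._custom_ops import awq_dequantize

# Call sites are now backend-agnostic:
# w = awq_dequantize(qweight, scales, qzeros)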
python/sglang/srt/models/deepseek_v2.py (+5 / -6)

@@ -51,6 +51,7 @@ from sglang.srt.layers.linear import (
 )
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.moe.ep_moe.layer import DeepEPMoE, EPMoE
+from sglang.srt.layers.moe.ep_moe.token_dispatcher import DeepEPDispatcher
 from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
 from sglang.srt.layers.moe.topk import select_experts
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
@@ -80,10 +81,8 @@ _is_cuda = is_cuda()
 if _is_cuda:
     from sgl_kernel import awq_dequantize, bmm_fp8, merge_state_v2
-
-    from sglang.srt.layers.moe.ep_moe.token_dispatcher import DeepEPDispatcher
 else:
-    from vllm import _custom_ops as ops
+    from vllm._custom_ops import awq_dequantize
 
 if _is_hip:
     from sglang.srt.layers.attention.triton_ops.rocm_mla_decode_rope import (
@@ -861,7 +860,7 @@ class DeepseekV2AttentionMLA(nn.Module):
             )
         elif self.w_kc.dtype == torch.float8_e4m3fn:
             q_nope_val, q_nope_scale = per_tensor_quant_mla_fp8(
-                q_nope.transpose(0, 1), dtype=torch.float8_e4m3fn
+                q_nope.transpose(0, 1),
             )
             q_nope_out = bmm_fp8(
                 q_nope_val, self.w_kc, q_nope_scale, self.w_scale, torch.bfloat16
@@ -892,7 +891,7 @@ class DeepseekV2AttentionMLA(nn.Module):
             )
         elif self.w_vc.dtype == torch.float8_e4m3fn:
             attn_output_val, attn_output_scale = per_tensor_quant_mla_fp8(
-                attn_output.transpose(0, 1), dtype=torch.float8_e4m3fn
+                attn_output.transpose(0, 1),
             )
             attn_bmm_output = bmm_fp8(
                 attn_output_val,
@@ -1565,7 +1564,7 @@ class DeepseekV2ForCausalLM(nn.Module):
                         self_attn.kv_b_proj.qzeros,
                     ).T
                 else:
-                    w = ops.awq_dequantize(
+                    w = awq_dequantize(
                         self_attn.kv_b_proj.qweight,
                         self_attn.kv_b_proj.scales,
                         self_attn.kv_b_proj.qzeros,
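Note: the two MLA hunks drop the explicit dtype=torch.float8_e4m3fn argument, which is only safe if per_tensor_quant_mla_fp8 already defaults to that dtype. The diff does not show the helper itself, so the sketch below is an assumed shape for such a function, not sglang's actual implementation:

import torch

def per_tensor_quant_mla_fp8(x, dtype=torch.float8_e4m3fn):
    # Hypothetical per-tensor quantizer: one scale for the whole tensor,
    # values clamped into the fp8 representable range.
    finfo = torch.finfo(dtype)
    scale = x.abs().amax().clamp(min=1e-12).float() / finfo.max
    x_q = (x.float() / scale).clamp(finfo.min, finfo.max).to(dtype)
    return x_q, scale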
python/sglang/srt/reasoning_parser.py (+0 / -1)

-import re
 from typing import Dict, Tuple
python/sglang/srt/sampling/sampling_batch_info.py (+2 / -3)

@@ -10,12 +10,11 @@ import torch
 import sglang.srt.sampling.penaltylib as penaltylib
 from sglang.srt.sampling.custom_logit_processor import CustomLogitProcessor
 
-logger = logging.getLogger(__name__)
-
-
 if TYPE_CHECKING:
     from sglang.srt.managers.schedule_batch import ScheduleBatch
 
+logger = logging.getLogger(__name__)
+
 
 @dataclasses.dataclass
 class SamplingBatchInfo:
python/sglang/srt/server.py (deleted, mode 100644 → 0; +0 / -18)

-# Copyright 2023-2024 SGLang Team
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-# Some shortcuts for backward compatibility.
-# They will be removed in new versions.
-from sglang.srt.entrypoints.engine import Engine
-from sglang.srt.entrypoints.http_server import kill_process_tree, launch_server
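Note: this file was only a backward-compatibility shim re-exporting the entrypoints. With it deleted, any remaining imports of sglang.srt.server must move to the real modules, exactly as this commit does in python/sglang/test/runners.py below:

# Before (relied on the deleted shim):
# from sglang.srt.server import Engine, kill_process_tree, launch_server

# After (import from the actual locations):
from sglang.srt.entrypoints.engine import Engine
from sglang.srt.entrypoints.http_server import kill_process_tree, launch_server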
python/sglang/srt/server_args.py (+6 / -10)

@@ -187,6 +187,7 @@ class ServerArgs:
     n_share_experts_fusion: int = 0
     disable_shared_experts_fusion: bool = False
     disable_chunked_prefix_cache: bool = False
+    disable_fast_image_processor: bool = False
 
     # Debug tensor dumps
     debug_tensor_dump_output_folder: Optional[str] = None
@@ -198,9 +199,6 @@ class ServerArgs:
     disaggregation_bootstrap_port: int = 8998
     disaggregation_transfer_backend: str = "mooncake"
 
-    # multimodal
-    disable_fast_image_processor: bool = False
-
     def __post_init__(self):
         # Expert parallelism
         if self.enable_ep_moe:
@@ -1136,6 +1134,11 @@ class ServerArgs:
             action="store_true",
             help="Disable chunked prefix cache feature for deepseek, which should save overhead for short sequences.",
         )
+        parser.add_argument(
+            "--disable-fast-image-processor",
+            action="store_true",
+            help="Adopt base image processor instead of fast image processor.",
+        )
 
         # Server warmups
         parser.add_argument(
@@ -1187,13 +1190,6 @@ class ServerArgs:
             help="The backend for disaggregation transfer. Default is mooncake.",
         )
 
-        # Multimodal
-        parser.add_argument(
-            "--disable-fast-image-processor",
-            action="store_true",
-            help="Adopt base image processor instead of fast image processor.",
-        )
-
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
         args.tp_size = args.tensor_parallel_size
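Note: the --disable-fast-image-processor flag itself is unchanged; its dataclass field and add_argument call are only regrouped out of the stray "multimodal" sections. A self-contained sketch of the field/flag pairing convention this file follows (the class name is a stand-in, while the flag and help text are copied from the diff):

import argparse
import dataclasses

@dataclasses.dataclass
class ServerArgsSketch:  # stand-in for sglang's ServerArgs
    disable_fast_image_processor: bool = False

parser = argparse.ArgumentParser()
parser.add_argument(
    "--disable-fast-image-processor",
    action="store_true",
    help="Adopt base image processor instead of fast image processor.",
)
args = parser.parse_args(["--disable-fast-image-processor"])
# argparse maps dashes to underscores, so the flag lines up with the field name.
assert args.disable_fast_image_processor is True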
python/sglang/srt/utils.py (+3 / -1)

@@ -55,7 +55,6 @@ import torch.distributed
 import torch.distributed as dist
 import triton
 import zmq
-from decord import VideoReader, cpu
 from fastapi.responses import ORJSONResponse
 from packaging import version as pkg_version
 from PIL import Image
@@ -545,6 +544,9 @@ def load_audio(audio_file: str, sr: int = 16000, mono: bool = True) -> np.ndarray:
 def encode_video(video_path, frame_count_limit=None):
+    # Lazy import because decord is not available on some arm platforms.
+    from decord import VideoReader, cpu
+
     if not os.path.exists(video_path):
         logger.error(f"Video {video_path} does not exist")
         return []
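Note: moving the decord import into encode_video makes the dependency lazy: sglang.srt.utils now imports cleanly on platforms without decord, and the ImportError can only surface when video decoding is actually requested. A generic sketch of the pattern (decode_frames is hypothetical, but VideoReader/cpu are decord's real API):

def decode_frames(path: str, num_frames: int = 8):
    # Deferred import keeps the enclosing module usable without the
    # optional dependency; only this code path requires it.
    try:
        from decord import VideoReader, cpu
    except ImportError as e:
        raise RuntimeError("video decoding requires the 'decord' package") from e
    vr = VideoReader(path, ctx=cpu(0))
    step = max(len(vr) // num_frames, 1)
    return [vr[i] for i in range(0, len(vr), step)][:num_frames]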
python/sglang/test/runners.py (+1 / -1)

@@ -26,8 +26,8 @@ from transformers import (
     AutoProcessor,
 )
 
+from sglang.srt.entrypoints.engine import Engine
 from sglang.srt.hf_transformers_utils import get_tokenizer
-from sglang.srt.server import Engine
 from sglang.srt.utils import load_image
 from sglang.test.test_utils import DEFAULT_PORT_FOR_SRT_TEST_RUNNER, calculate_rouge_l
python/sglang/test/test_custom_ops.py (+1 / -1)

@@ -3,7 +3,7 @@
 import pytest
 import torch
 
-from sglang.srt.custom_op import scaled_fp8_quant
+from sglang.srt.layers.quantization.fp8_kernel import scaled_fp8_quant
 from sglang.srt.utils import is_cuda
test/srt/test_fp8_kernel.py (+1 / -3)

@@ -93,9 +93,7 @@ class TestPerTokenGroupQuantFP8(TestFP8Base):
         A, A_quant_gt, scale_gt = self._make_A(
             M=self.M, K=self.K, group_size=self.group_size, out_dtype=self.quant_type
         )
-        A_quant, scale = per_token_group_quant_fp8(
-            x=A, group_size=self.group_size, dtype=self.quant_type
-        )
+        A_quant, scale = per_token_group_quant_fp8(x=A, group_size=self.group_size)
         torch.testing.assert_close(scale, scale_gt)
         diff = (A_quant.to(torch.float16) - A_quant_gt.to(torch.float16)).abs()
         diff_count = (diff > 1e-5).count_nonzero()
test/srt/test_triton_moe_channel_fp8_kernel.py (+3 / -3)

@@ -3,9 +3,9 @@ import unittest
 import torch
 
-from sglang.srt.custom_op import scaled_fp8_quant as sgl_scaled_fp8_quant
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_moe
+from sglang.srt.layers.quantization.fp8_kernel import scaled_fp8_quant
 from sglang.test.test_utils import CustomTestCase
@@ -41,7 +41,7 @@ def torch_w8a8_per_column_moe(a, w1, w2, w1_s, w2_s, score, topk):
     B, D = a.shape
     # Perform per-token quantization
-    a_q, a_s = sgl_scaled_fp8_quant(a, use_per_token_if_dynamic=True)
+    a_q, a_s = scaled_fp8_quant(a, use_per_token_if_dynamic=True)
     # Repeat tokens to match topk
     a_q = a_q.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D)
     # Also repeat the scale
@@ -69,7 +69,7 @@ def torch_w8a8_per_column_moe(a, w1, w2, w1_s, w2_s, score, topk):
     # Activation function
     act_out = SiluAndMul().forward_native(inter_out)
     # Quantize activation output with per-token
-    act_out_q, act_out_s = sgl_scaled_fp8_quant(act_out, use_per_token_if_dynamic=True)
+    act_out_q, act_out_s = scaled_fp8_quant(act_out, use_per_token_if_dynamic=True)
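Note: after this commit the import path these tests use for scaled_fp8_quant is sglang.srt.layers.quantization.fp8_kernel rather than sglang.srt.custom_op, with the call signature unchanged. Minimal usage matching the updated tests (assumes a CUDA build of sglang with fp8 support):

import torch
from sglang.srt.layers.quantization.fp8_kernel import scaled_fp8_quant

# Dynamic per-token quantization, as exercised above: returns the fp8
# tensor and one scale per token.
x = torch.randn(16, 128, device="cuda", dtype=torch.float16)
x_q, x_s = scaled_fp8_quant(x, use_per_token_if_dynamic=True)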