Unverified commit 00d25a7f in zhaoyu6/sglang, authored by Lianmin Zheng and committed via GitHub on Mar 10, 2025.
Fix quantization and nightly tests (#4258)
Parent: 1a5023e0
Showing 7 changed files with 142 additions and 70 deletions:
    python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py  (+0, -1)
    python/sglang/srt/layers/quantization/__init__.py            (+88, -68)
    python/sglang/srt/model_executor/model_runner.py             (+4, -0)
    python/sglang/test/test_utils.py                             (+4, -1)
    test/srt/run_suite.py                                        (+1, -0)
    test/srt/test_awq.py                                         (new file: +44, -0)
    test/srt/test_nightly_gsm8k_eval.py                          (+1, -0)
python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py (+0, -1)

One import is dropped from the `sglang.srt.utils` import block; after the change it reads:

```diff
@@ -23,7 +23,6 @@ from sglang.srt.utils import (
     direct_register_custom_op,
     get_bool_env_var,
     get_device_name,
     is_cuda_available,
     is_hip,
 )
```
python/sglang/srt/layers/quantization/__init__.py (+88, -68)
At the top of the file, `builtins` and `inspect` join the imports (both are used by the new patch helpers below):

```python
# Adapted from https://raw.githubusercontent.com/vllm-project/vllm/v0.5.5/vllm/model_executor/layers/quantization/__init__.py
import builtins
import inspect
import re
from copy import deepcopy
from typing import Callable, Dict, Optional, Type, Union
```
The module-level `AWQMoEMethod` import is dropped; it is now imported locally where needed:

```diff
@@ -6,10 +8,7 @@ from typing import Callable, Dict, Optional, Type, Union
 import torch
 from vllm.model_executor.layers.quantization.aqlm import AQLMConfig
 from vllm.model_executor.layers.quantization.awq import AWQConfig
-from vllm.model_executor.layers.quantization.awq_marlin import (
-    AWQMarlinConfig,
-    AWQMoEMethod,
-)
+from vllm.model_executor.layers.quantization.awq_marlin import AWQMarlinConfig
 from vllm.model_executor.layers.quantization.bitsandbytes import BitsAndBytesConfig
 from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (
     CompressedTensorsConfig,
```
The main hunk (`@@ -180,96 +179,117 @@`) starts right after `gptq_get_quant_method`'s final `return None` and reworks the AWQ path. After the change, `awq_get_quant_method` dispatches sglang layer types to vllm's AWQ-Marlin methods, and a new reversible `isinstance` patch lets vllm's `get_quant_method` recognize sglang layers:

```python
def awq_get_quant_method(self, layer, prefix):
    from vllm.model_executor.layers.quantization.awq import is_layer_skipped_awq
    from vllm.model_executor.layers.quantization.awq_marlin import (
        AWQMarlinLinearMethod,
        AWQMoEMethod,
    )

    from sglang.srt.layers.linear import LinearBase, UnquantizedLinearMethod
    from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
    from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead

    if isinstance(layer, LinearBase) or (
        isinstance(layer, ParallelLMHead) and self.lm_head_quantized
    ):
        if is_layer_skipped_awq(prefix, self.modules_to_not_convert):
            return UnquantizedLinearMethod()
        return AWQMarlinLinearMethod(self)
    elif isinstance(layer, FusedMoE):
        return AWQMoEMethod(self)
    return None


original_isinstance = builtins.isinstance


def monkey_patch_isinstance_for_vllm_base_layer(reverse: bool = False):
    """
    Patch isinstance so that the `get_quant_method` in vllm's QuantizationConfig
    can recognize sglang layers
    """
    if reverse:
        builtins.isinstance = original_isinstance
        return

    from vllm.model_executor.layers.fused_moe import FusedMoE
    from vllm.model_executor.layers.linear import LinearBase
    from vllm.model_executor.layers.vocab_parallel_embedding import (
        VocabParallelEmbedding,
    )

    from sglang.srt.layers.linear import LinearBase as PatchedLinearBase
    from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE as PatchedFusedMoE
    from sglang.srt.layers.vocab_parallel_embedding import (
        VocabParallelEmbedding as PatchedVocabParallelEmbedding,
    )

    def patched_isinstance(obj, classinfo):
        if classinfo is LinearBase:
            return original_isinstance(obj, PatchedLinearBase)
        if classinfo is FusedMoE:
            return original_isinstance(obj, PatchedFusedMoE)
        if classinfo is VocabParallelEmbedding:
            return original_isinstance(obj, PatchedVocabParallelEmbedding)
        return original_isinstance(obj, classinfo)

    builtins.isinstance = patched_isinstance
```
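The reversible `builtins.isinstance` override is the load-bearing trick here: vllm's `get_quant_method` implementations type-check layers against vllm's own base classes, so sglang temporarily redirects those checks to its layer classes and restores the builtin afterwards. A minimal self-contained sketch of the same pattern, using toy classes rather than the real sglang/vllm ones:

```python
import builtins

_original_isinstance = builtins.isinstance

class LibraryBase: ...   # stands in for vllm's LinearBase
class PatchedBase: ...   # stands in for sglang's LinearBase

def patch_isinstance(reverse: bool = False):
    if reverse:
        # Restore the untouched builtin.
        builtins.isinstance = _original_isinstance
        return

    def patched(obj, classinfo):
        # Redirect checks against the library's class to our class.
        if classinfo is LibraryBase:
            return _original_isinstance(obj, PatchedBase)
        return _original_isinstance(obj, classinfo)

    builtins.isinstance = patched

patch_isinstance()
assert isinstance(PatchedBase(), LibraryBase)       # True via redirection
patch_isinstance(reverse=True)
assert not isinstance(PatchedBase(), LibraryBase)   # builtin behavior restored
```

Because the override keys on `classinfo is LibraryBase` (identity, not subclassing), every other `isinstance` call in the process behaves exactly as before.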
The hard-coded `awq_moe_method_apply` wrapper around `AWQMoEMethod.apply` is replaced by a generic `monkey_patch_moe_apply`, which adapts sglang's FusedMoE call signature to whichever `apply` signature the installed vllm exposes:

```diff
-original_awq_moe_method_apply = AWQMoEMethod.apply
-
-
-def awq_moe_method_apply(
-    self,
-    layer: torch.nn.Module,
-    x: torch.Tensor,
-    router_logits: torch.Tensor,
-    top_k: int,
-    renormalize: bool,
-    use_grouped_topk: bool = False,
-    topk_group: Optional[int] = None,
-    num_expert_group: Optional[int] = None,
-    custom_routing_function: Optional[Callable] = None,
-    scoring_func: str = "softmax",
-    e_score_correction_bias: Optional[torch.Tensor] = None,
-    **kwargs,
-):
-    return original_awq_moe_method_apply(
-        self,
-        layer,
-        x,
-        router_logits,
-        top_k,
-        renormalize,
-        use_grouped_topk,
-        topk_group,
-        num_expert_group,
-        custom_routing_function,
-        scoring_func,
-        e_score_correction_bias,
-    )
+def monkey_patch_moe_apply(class_obj: "FusedMoEMethodBase"):
+    """
+    Monkey patch the apply function of vllm's FusedMoEMethodBase.
+    Convert sglang arguments to vllm arguments.
+    """
+    original_apply = class_obj.apply
+    sig = inspect.signature(original_apply)
+    param_names = list(sig.parameters.keys())
+    has_correction_bias = "e_score_correction_bias" in param_names
+
+    def new_apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        router_logits: torch.Tensor,
+        top_k: int,
+        renormalize: bool,
+        use_grouped_topk: bool,
+        topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
+        custom_routing_function: Optional[Callable] = None,
+        correction_bias: Optional[torch.Tensor] = None,
+        activation: str = "silu",
+        inplace: bool = True,
+        no_combine: bool = False,
+    ):
+        assert activation == "silu"
+        assert inplace and not no_combine
+
+        kwargs = {
+            "self": self,
+            "layer": layer,
+            "x": x,
+            "router_logits": router_logits,
+            "top_k": top_k,
+            "renormalize": renormalize,
+            "use_grouped_topk": use_grouped_topk,
+            "topk_group": topk_group,
+            "num_expert_group": num_expert_group,
+            "custom_routing_function": custom_routing_function,
+        }
+        if correction_bias is not None:
+            if not has_correction_bias:
+                raise ValueError(
+                    "Please increase the version of your vllm. Try `pip install vllm==0.7.2`"
+                )
+            kwargs["e_score_correction_bias"] = correction_bias
+        return original_apply(**kwargs)
+
+    setattr(class_obj, "apply", new_apply)
```
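Instead of pinning a vllm version, `monkey_patch_moe_apply` feature-detects support for `e_score_correction_bias` by inspecting the wrapped function's signature. The same detection pattern in isolation, with hypothetical function names standing in for two backend versions:

```python
import inspect

def apply_v1(x, top_k):                                   # older API: no bias parameter
    return (x, top_k)

def apply_v2(x, top_k, e_score_correction_bias=None):     # newer API: accepts the bias
    return (x, top_k, e_score_correction_bias)

def call_compat(fn, x, top_k, correction_bias=None):
    """Forward correction_bias only if fn's signature accepts it."""
    params = inspect.signature(fn).parameters
    kwargs = {"x": x, "top_k": top_k}
    if correction_bias is not None:
        if "e_score_correction_bias" not in params:
            raise ValueError("installed backend is too old for correction_bias")
        kwargs["e_score_correction_bias"] = correction_bias
    return fn(**kwargs)

print(call_compat(apply_v1, 1, 2))                        # works without the bias
print(call_compat(apply_v2, 1, 2, correction_bias=0.5))   # bias forwarded when supported
```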
Finally, the narrower `patch_vllm_linear_base_isinstance` helper (subsumed by `monkey_patch_isinstance_for_vllm_base_layer` above) is removed, and `apply_monkey_patches` becomes `monkey_patch_quant_configs`, which also routes the GPTQ and compressed-tensors MoE methods through the generic patch:

```diff
-def patch_vllm_linear_base_isinstance():
-    import builtins
-
-    from vllm.model_executor.layers.linear import LinearBase
-
-    from sglang.srt.layers.linear import LinearBase as PatchedLinearBase
-
-    original_isinstance = builtins.isinstance
-
-    def patched_isinstance(obj, classinfo):
-        if classinfo is LinearBase:
-            return original_isinstance(obj, PatchedLinearBase)
-        return original_isinstance(obj, classinfo)
-
-    builtins.isinstance = patched_isinstance
-
-
-def apply_monkey_patches():
+def monkey_patch_quant_configs():
     """Apply all monkey patches in one place."""
+    from vllm.model_executor.layers.quantization.awq_marlin import AWQMoEMethod
+    from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe import (
+        CompressedTensorsW8A8Fp8MoEMethod,
+        CompressedTensorsWNA16MoEMethod,
+    )
+    from vllm.model_executor.layers.quantization.gptq_marlin import GPTQMarlinMoEMethod
+
     setattr(GPTQMarlinConfig, "get_quant_method", gptq_get_quant_method)
     setattr(GPTQConfig, "get_quant_method", gptq_get_quant_method)
     setattr(AWQMarlinConfig, "get_quant_method", awq_get_quant_method)
-    setattr(AWQMoEMethod, "apply", awq_moe_method_apply)
+
+    monkey_patch_moe_apply(AWQMoEMethod)
+    monkey_patch_moe_apply(GPTQMarlinMoEMethod)
+    monkey_patch_moe_apply(CompressedTensorsW8A8Fp8MoEMethod)
+    monkey_patch_moe_apply(CompressedTensorsWNA16MoEMethod)


-patch_vllm_linear_base_isinstance()
-# Apply patches when module is imported
-apply_monkey_patches()
+monkey_patch_quant_configs()

 __all__ = [
     "QuantizationConfig",
     "get_quantization_config",
     "QUANTIZATION_METHODS",
 ]
```
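Note the bottom-of-module call is preserved: `monkey_patch_quant_configs()` runs when `sglang.srt.layers.quantization` is first imported (the old comment read "Apply patches when module is imported"), so the GPTQ/AWQ `get_quant_method` overrides are always installed. Only the more invasive `isinstance` patch is applied and reversed explicitly by the model runner, shown in the next file.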
python/sglang/srt/model_executor/model_runner.py (+4, -0)
The new patch helper is imported:

```diff
@@ -41,6 +41,7 @@ from sglang.srt.layers.dp_attention import (
     initialize_dp_attention,
 )
 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
+from sglang.srt.layers.quantization import monkey_patch_isinstance_for_vllm_base_layer
 from sglang.srt.layers.sampler import Sampler
 from sglang.srt.layers.torchao_utils import apply_torchao_config_to_model
 from sglang.srt.lora.lora_manager import LoRAManager
```
It is switched on just before the model loads:

```diff
@@ -341,6 +342,8 @@ class ModelRunner:
         # Load the model
         # Remove monkey_patch when linear.py quant remove dependencies with vllm
         monkey_patch_vllm_parallel_state()
+        monkey_patch_isinstance_for_vllm_base_layer()
         with self.memory_saver_adapter.region():
             self.model = get_model(
                 model_config=self.model_config,
```
And reversed immediately after loading completes:

```diff
@@ -348,6 +351,7 @@ class ModelRunner:
                 device_config=DeviceConfig(self.device),
             )
         monkey_patch_vllm_parallel_state(reverse=True)
+        monkey_patch_isinstance_for_vllm_base_layer(reverse=True)

         if self.server_args.kv_cache_dtype == "fp8_e4m3":
             if self.server_args.quantization_param_path is not None:
```
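The patch is tightly scoped: installed just before `get_model` and reversed right after, so the global `isinstance` override cannot leak into the rest of the process. One way to express that bracketing is a context manager; the `reversible_patch` helper below is an illustrative sketch, not sglang API:

```python
from contextlib import contextmanager

@contextmanager
def reversible_patch(apply_fn):
    """Apply a reversible monkey patch for the duration of a block.

    apply_fn must accept reverse=False to install the patch and
    reverse=True to restore the original state.
    """
    apply_fn()
    try:
        yield
    finally:
        # Restore even if model loading raises.
        apply_fn(reverse=True)

# Usage, mirroring how ModelRunner brackets get_model():
#
# with reversible_patch(monkey_patch_isinstance_for_vllm_base_layer):
#     self.model = get_model(model_config=self.model_config, ...)
```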
python/sglang/test/test_utils.py (+4, -1)
A dedicated constant for the AWQ MoE model is added, and the Mixtral AWQ checkpoint joins the nightly quantized-eval list:

```diff
@@ -36,12 +36,15 @@ DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST = "Alibaba-NLP/gte-Qwen2-1.5B-instru
 DEFAULT_MLA_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
 DEFAULT_MLA_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
 DEFAULT_REASONING_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
+DEFAULT_AWQ_MOE_MODEL_NAME_FOR_TEST = (
+    "hugging-quants/Mixtral-8x7B-Instruct-v0.1-AWQ-INT4"
+)
 DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 1000
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8,neuralmagic/Mistral-7B-Instruct-v0.3-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8,neuralmagic/gemma-2-2b-it-FP8"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8,neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,neuralmagic/Qwen2-72B-Instruct-FP8,neuralmagic/Qwen2-57B-A14B-Instruct-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
-DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4,hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4"
+DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4,hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4,hugging-quants/Mixtral-8x7B-Instruct-v0.1-AWQ-INT4"
 DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN = "Qwen/Qwen2.5-1.5B-Instruct"
 DEFAULT_SMALL_VLM_MODEL_NAME = "Qwen/Qwen2-VL-2B"
```
test/srt/run_suite.py (+1, -0)
The new test is registered in the suite (note it is the only entry without an estimated runtime):

```diff
@@ -22,6 +22,7 @@ suites = {
         TestFile("models/test_reward_models.py", 83),
         TestFile("models/test_gme_qwen_models.py", 45),
         TestFile("test_abort.py", 51),
+        TestFile("test_awq.py"),
         TestFile("test_block_int8.py", 22),
         TestFile("test_chunked_prefill.py", 336),
         TestFile("test_eagle_infer.py", 447),
```
test/srt/test_awq.py (new file, 100644: +44, -0)

The new test launches a server with the AWQ MoE model and checks MMLU accuracy:

```python
import unittest
from types import SimpleNamespace

from sglang.srt.utils import kill_process_tree
from sglang.test.run_eval import run_eval
from sglang.test.test_utils import (
    DEFAULT_AWQ_MOE_MODEL_NAME_FOR_TEST,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    popen_launch_server,
)


class TestAWQ(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_AWQ_MOE_MODEL_NAME_FOR_TEST
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=["--trust-remote-code"],
        )

    @classmethod
    def tearDownClass(cls):
        kill_process_tree(cls.process.pid)

    def test_mmlu(self):
        args = SimpleNamespace(
            base_url=self.base_url,
            model=self.model,
            eval_name="mmlu",
            num_examples=64,
            num_threads=32,
        )
        metrics = run_eval(args)
        self.assertGreater(metrics["score"], 0.65)


if __name__ == "__main__":
    unittest.main()
```
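As a plain `unittest` module, the test can be run directly with `python3 test/srt/test_awq.py`, and the `run_suite.py` entry above should let the CI suite pick it up.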
test/srt/test_nightly_gsm8k_eval.py (+1, -0)

A GSM8K score threshold is added for the new Mixtral AWQ model:

```diff
@@ -38,6 +38,7 @@ MODEL_SCORE_THRESHOLDS = {
     "neuralmagic/Qwen2-57B-A14B-Instruct-FP8": 0.82,
     "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4": 0.84,
     "hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4": 0.83,
+    "hugging-quants/Mixtral-8x7B-Instruct-v0.1-AWQ-INT4": 0.60,
 }
```