Commit 6249e4a1 (unverified)
Revert "Integration of TurboMind AWQ" (#2866)
Authored by Lianmin Zheng on Jan 13, 2025; committed via GitHub on Jan 13, 2025.
Parent commit: f3516c28
Showing 8 changed files with 2 additions and 411 deletions (+2, -411).
python/pyproject.toml                                      +1   -1
python/sglang/srt/configs/model_config.py                  +1   -9
python/sglang/srt/layers/linear.py                         +0   -1
python/sglang/srt/layers/quantization/__init__.py          +0   -2
python/sglang/srt/layers/quantization/awq_turbomind.py     +0   -287
python/sglang/srt/layers/quantization/turbomind_utils.py   +0   -63
python/sglang/srt/server_args.py                           +0   -1
test/srt/test_turbomind_awq.py                             +0   -47
python/pyproject.toml

@@ -28,7 +28,7 @@ runtime_common = [
 srt = [
   "sglang[runtime_common]", "cuda-python",
   "sgl-kernel>=0.0.2.post11", "torch", "vllm>=0.6.3.post1,<=0.6.4.post1",
-  "flashinfer==0.1.6", "turbomind"
+  "flashinfer==0.1.6"
 ]

 # HIP (Heterogeneous-computing Interface for Portability) for AMD
python/sglang/srt/configs/model_config.py

@@ -14,7 +14,6 @@
 import json
 import logging
-import sys
 from enum import IntEnum, auto
 from typing import List, Optional, Set, Union

@@ -231,7 +230,7 @@ class ModelConfig:
         # Parse quantization method from the HF model config, if available.
         quant_cfg = self._parse_quant_hf_config()
-        if quant_cfg is not None and not quantization_in_turbomind(self.quantization):
+        if quant_cfg is not None:
             quant_method = quant_cfg.get("quant_method", "").lower()
             # Detect which checkpoint is it

@@ -402,10 +401,3 @@ def is_multimodal_model(model_architectures: List[str]):
 def is_encoder_decoder_model(model_architectures: List[str]):
     return "MllamaForConditionalGeneration" in model_architectures
-
-
-def quantization_in_turbomind(quantization: str) -> bool:
-    if quantization in ["awq_turbomind"]:
-        return True
-    else:
-        return False
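For context on the hunk above: before this revert, the HF quantization config was parsed only when the selected backend was not TurboMind. The snippet below is a minimal, self-contained sketch of that control flow; apart from quantization_in_turbomind, the function name and simplified signature are illustrative stand-ins, not the actual ModelConfig code.

from typing import Optional

def quantization_in_turbomind(quantization: Optional[str]) -> bool:
    # Helper deleted by this commit: TurboMind handled its own quant config.
    return quantization in ["awq_turbomind"]

def parse_quant_method(quant_cfg: Optional[dict], quantization: Optional[str]) -> Optional[str]:
    # Pre-revert guard: skip HF quant-config parsing when TurboMind owns quantization.
    # Post-revert, the branch runs whenever quant_cfg is present.
    if quant_cfg is not None and not quantization_in_turbomind(quantization):
        return quant_cfg.get("quant_method", "").lower()
    return None

print(parse_quant_method({"quant_method": "AWQ"}, None))             # awq
print(parse_quant_method({"quant_method": "AWQ"}, "awq_turbomind"))  # None (pre-revert behavior)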
python/sglang/srt/layers/linear.py

@@ -48,7 +48,6 @@ WEIGHT_LOADER_V2_SUPPORTED = [
     "GPTQLinearMethod",
     "FBGEMMFp8LinearMethod",
     "ModelOptFp8LinearMethod",
-    "AWQTurbomindLinearMethod",
     "IPEXAWQLinearMethod",
 ]
python/sglang/srt/layers/quantization/__init__.py

@@ -20,7 +20,6 @@ from vllm.model_executor.layers.quantization.marlin import MarlinConfig
 from vllm.model_executor.layers.quantization.qqq import QQQConfig
 from vllm.model_executor.layers.quantization.tpu_int8 import Int8TpuConfig

-from sglang.srt.layers.quantization.awq_turbomind import AWQTurbomindConfig
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.quantization.fp8 import Fp8Config
 from sglang.srt.layers.quantization.modelopt_quant import ModelOptFp8Config

@@ -38,7 +37,6 @@ QUANTIZATION_METHODS: Dict[str, Type[QuantizationConfig]] = {
     "gptq_marlin_24": GPTQMarlin24Config,
     "gptq_marlin": GPTQMarlinConfig,
     "awq_marlin": AWQMarlinConfig,
-    "awq_turbomind": AWQTurbomindConfig,
     "gptq": GPTQConfig,
     "compressed-tensors": CompressedTensorsConfig,
     "bitsandbytes": BitsAndBytesConfig,
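To illustrate what removing the "awq_turbomind" entry means downstream, here is a minimal sketch of the registry lookup pattern this dictionary serves; the stand-in classes and the get_quantization_config helper are simplified assumptions for illustration, not the exact sglang implementation.

from typing import Dict, Type

class QuantizationConfig:  # stand-in for sglang.srt.layers.quantization.base_config
    pass

class AWQMarlinConfig(QuantizationConfig):
    pass

QUANTIZATION_METHODS: Dict[str, Type[QuantizationConfig]] = {
    "awq_marlin": AWQMarlinConfig,
    # "awq_turbomind": AWQTurbomindConfig,  # entry removed by this revert
}

def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
    # After the revert, "awq_turbomind" no longer resolves and raises here.
    if quantization not in QUANTIZATION_METHODS:
        raise ValueError(f"Invalid quantization method: {quantization}")
    return QUANTIZATION_METHODS[quantization]

print(get_quantization_config("awq_marlin"))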
python/sglang/srt/layers/quantization/awq_turbomind.py (deleted, mode 100644 → 0)

import logging
import os
import sys
from typing import Any, Dict, List, Optional

import torch
import turbomind
from torch.nn import Parameter

turbomind_dir = os.path.split(turbomind.__file__)[0]
sys.path.append(os.path.join(turbomind_dir, "lib"))
import _turbomind_ext

from vllm.model_executor.layers.linear import LinearBase

from sglang.srt.layers.linear import LinearMethodBase, UnquantizedLinearMethod
from sglang.srt.layers.parameter import GroupQuantScaleParameter, PackedvLLMParameter
from sglang.srt.layers.quantization.base_config import (
    QuantizationConfig,
    QuantizeMethodBase,
)
from sglang.srt.layers.quantization.turbomind_utils import (
    get_u4_slices,
    is_layer_skipped_awq,
    pack_u4_row,
    unpack_awq_gemm,
    verify_turbomind_supported,
)
from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
from sglang.srt.utils import is_cuda, set_weight_attrs

logger = logging.getLogger(__name__)


class AWQTurbomindConfig(QuantizationConfig):
    """Config class for AWQ Turbomind"""

    def __init__(
        self,
        weight_bits: int,
        group_size: int,
        zero_point: bool,
        lm_head_quantized: bool,
        modules_to_not_convert: Optional[List[str]] = None,
    ) -> None:
        self.pack_factor = 32 // weight_bits  # packed into int32
        self.group_size = group_size
        self.zero_point = zero_point
        self.lm_head_quantized = lm_head_quantized
        self.weight_bits = weight_bits
        self.modules_to_not_convert = modules_to_not_convert or []

        verify_turbomind_supported(self.weight_bits, self.group_size)

    def __repr__(self) -> str:
        return (
            f"AWQTurbomindConfig(weight_bits={self.weight_bits}, "
            f"group_size={self.group_size}, "
            f"zero_point={self.zero_point}, "
            f"lm_head_quantized={self.lm_head_quantized}, "
            f"modules_to_not_convert={self.modules_to_not_convert})"
        )

    @classmethod
    def get_name(cls) -> str:
        return "awq_turbomind"

    @classmethod
    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
        return [torch.half, torch.bfloat16]

    @classmethod
    def get_min_capability(cls) -> int:
        return 70

    @classmethod
    def get_config_filenames(cls) -> List[str]:
        return ["quantize_config.json"]

    @classmethod
    def from_config(cls, config: Dict[str, Any]) -> "AWQTurbomindConfig":
        weight_bits = cls.get_from_keys(config, ["bits"])
        group_size = cls.get_from_keys(config, ["group_size"])
        zero_point = cls.get_from_keys(config, ["zero_point"])
        lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], default=False)
        modules_to_not_convert = cls.get_from_keys_or(
            config, ["modules_to_not_convert"], None
        )
        return cls(
            weight_bits,
            group_size,
            zero_point,
            lm_head_quantized,
            modules_to_not_convert,
        )

    @classmethod
    def override_quantization_method(cls, hf_quant_cfg, user_quant) -> Optional[str]:
        can_convert = cls.is_awq_turbomind_compatible(hf_quant_cfg)
        is_valid_user_quant = user_quant is None or user_quant == "awq_turbomind"

        if can_convert and is_valid_user_quant:
            msg = (
                f"The model is convertible to {cls.get_name()} during runtime. "
                f"Using {cls.get_name()} kernel."
            )
            logger.info(msg)
            return cls.get_name()

        if can_convert and user_quant == "awq":
            logger.info(
                "Detected that the model can run with awq_turbomind"
                ", however you specified quantization=awq explicitly,"
                " so forcing awq. Use quantization=awq_turbomind for"
                " faster inference"
            )
        return None

    def get_quant_method(
        self, layer: torch.nn.Module, prefix: str
    ) -> Optional["QuantizeMethodBase"]:
        if isinstance(layer, LinearBase) or (
            isinstance(layer, ParallelLMHead) and self.lm_head_quantized
        ):
            if is_layer_skipped_awq(prefix, self.modules_to_not_convert):
                return UnquantizedLinearMethod()
            return AWQTurbomindLinearMethod(self)
        return None

    @classmethod
    def is_awq_turbomind_compatible(cls, quant_config: Dict[str, Any]):
        if not is_cuda():
            return False

        # Extract data from quant config.
        quant_method = quant_config.get("quant_method", "").lower()
        num_bits = quant_config.get("bits")
        group_size = quant_config.get("group_size")
        zero_point = quant_config.get("zero_point")

        if quant_method != "awq":
            return False

        # If we cannot find the info needed in the config, cannot convert.
        if num_bits is None or group_size is None or zero_point is None:
            return False

        return verify_turbomind_supported(quant_bit=num_bits, group_size=group_size)

    def get_scaled_act_names(self) -> List[str]:
        return []


class AWQTurbomindLinearMethod(LinearMethodBase):
    """Linear method for AWQ Turbomind.

    Args:
        quant_config: The AWQ Turbomind quantization config.
    """

    def __init__(self, quant_config: AWQTurbomindConfig) -> None:
        self.quant_config = quant_config

    def create_weights(
        self,
        layer: torch.nn.Module,
        input_size_per_partition: int,
        output_partition_sizes: List[int],
        input_size: int,
        output_size: int,
        params_dtype: torch.dtype,
        **extra_weight_attrs,
    ) -> None:
        output_size_per_partition = sum(output_partition_sizes)
        weight_loader = extra_weight_attrs.get("weight_loader")

        # Normalize group_size
        if self.quant_config.group_size != -1:
            group_size = self.quant_config.group_size
        else:
            group_size = input_size

        qweight = PackedvLLMParameter(
            data=torch.empty(
                input_size_per_partition,
                output_size_per_partition // self.quant_config.pack_factor,
                dtype=torch.int32,
            ),
            input_dim=0,
            output_dim=1,
            packed_dim=1,
            packed_factor=self.quant_config.pack_factor,
            weight_loader=weight_loader,
        )

        num_groups = input_size_per_partition // group_size

        qzeros = PackedvLLMParameter(
            data=torch.empty(
                num_groups,
                output_size_per_partition // self.quant_config.pack_factor,
                dtype=torch.int32,
            ),
            input_dim=0,
            output_dim=1,
            packed_dim=1,
            packed_factor=self.quant_config.pack_factor,
            weight_loader=weight_loader,
        )

        scales = GroupQuantScaleParameter(
            data=torch.empty(
                num_groups,
                output_size_per_partition,
                dtype=params_dtype,
            ),
            input_dim=0,
            output_dim=1,
            weight_loader=weight_loader,
        )

        layer.register_parameter("qweight", qweight)
        layer.register_parameter("qzeros", qzeros)
        layer.register_parameter("scales", scales)

        layer.input_size_per_partition = input_size_per_partition
        layer.output_size_per_partition = output_size_per_partition
        layer.num_groups = num_groups

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        qweight_turbomind = unpack_awq_gemm(layer.qweight.data)
        qzeros_turbomind = unpack_awq_gemm(layer.qzeros.data)
        scales_turbomind = layer.scales.data

        qweight_turbomind = pack_u4_row(qweight_turbomind)
        qzeros_turbomind = qzeros_turbomind.to(torch.half)

        device_id = layer.qweight.device.index
        properties = torch.cuda.get_device_properties(device_id)

        def is_16xx_series(name):
            import re

            pattern = r"GTX 16\d\d"
            return bool(re.search(pattern, name))

        simt = is_16xx_series(properties.name)

        qweight_turbomind = qweight_turbomind.contiguous()
        scales_turbomind = scales_turbomind.contiguous()
        qzeros_turbomind = qzeros_turbomind.contiguous()

        self.linear = _turbomind_ext.Linear(
            layer.input_size_per_partition,
            layer.output_size_per_partition,
            self.quant_config.weight_bits,
            self.quant_config.group_size,
        )
        self.linear.post_init(
            qweight_turbomind, scales_turbomind, qzeros_turbomind, simt
        )

        layer.qweight = Parameter(qweight_turbomind, requires_grad=False)
        layer.scales = Parameter(scales_turbomind, requires_grad=False)
        layer.qzeros = Parameter(qzeros_turbomind, requires_grad=False)

    def apply(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        bias: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        x = x.view(-1, x.shape[-1])
        out_shape = x.shape[:-1] + (layer.output_size_per_partition,)
        out = torch.empty(
            (x.shape[0], layer.output_size_per_partition),
            dtype=torch.float16,
            device=x.device,
        )
        stream = torch.cuda.current_stream()
        self.linear.forward(x, out, stream.cuda_stream)
        out = torch.from_dlpack(out)

        if bias is not None:
            out.add_(bias)

        return out.view(out_shape)
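As a quick illustration of the shape conventions used by create_weights above: 4-bit weights are packed eight to an int32 word, and zero points and scales are stored once per quantization group. The layer sizes below are made-up example values, not anything from the commit.

import torch

# Hypothetical per-partition sizes for one linear layer (example values only).
input_size_per_partition = 4096
output_size_per_partition = 11008
weight_bits, group_size = 4, 128

pack_factor = 32 // weight_bits                     # 8 int4 values per int32 word
num_groups = input_size_per_partition // group_size

qweight = torch.empty(
    input_size_per_partition, output_size_per_partition // pack_factor, dtype=torch.int32
)
qzeros = torch.empty(num_groups, output_size_per_partition // pack_factor, dtype=torch.int32)
scales = torch.empty(num_groups, output_size_per_partition, dtype=torch.float16)

print(qweight.shape)  # torch.Size([4096, 1376])
print(qzeros.shape)   # torch.Size([32, 1376])
print(scales.shape)   # torch.Size([32, 11008])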
python/sglang/srt/layers/quantization/turbomind_utils.py (deleted, mode 100644 → 0)

# Copyright (c) OpenMMLab. All rights reserved.
from typing import List

import torch

from sglang.srt.utils import get_device_capability


def get_u4_slices(x: torch.Tensor, dtype: torch.dtype) -> List[torch.Tensor]:
    assert x.dtype == torch.int32
    xs = []
    for _ in range(8):
        xs.append((x & 15).to(dtype))
        x = x >> 4
    return xs


def unpack_awq_gemm(x: torch.Tensor) -> torch.Tensor:
    """
    The int4 weights are packed into int32:
    bit:    31-28  27-24  23-20  19-16  15-12  11-8   7-4    3-0
    weight: int4_1 int4_2 int4_3 int4_4 int4_5 int4_6 int4_7 int4_8
    """
    xs = get_u4_slices(x, torch.uint8)
    order = [0, 4, 1, 5, 2, 6, 3, 7]
    ys = [xs[i] for i in order]
    return torch.stack(ys, dim=-1).view(*x.shape[:-1], -1)


def pack_u4_row(x: torch.Tensor) -> torch.Tensor:
    assert x.dtype == torch.uint8
    xs = x.view(*x.shape[:-1], -1, 8).split(1, dim=-1)
    a = torch.zeros(xs[0].shape, dtype=torch.int32, device=x.device)
    for t in reversed(xs):
        a = (a << 4) | t
    return a.squeeze(dim=-1)


def verify_turbomind_supported(quant_bit: int, group_size: int) -> bool:
    if quant_bit not in [4]:
        raise NotImplementedError(
            f"[Tubomind] Only 4-bit is supported for now, but got {quant_bit} bit"
        )
    if group_size != 128:
        raise NotImplementedError(
            f"[Tubomind] Only group_size 128 is supported for now, "
            f"but got group_size {group_size}"
        )

    major, minor = get_device_capability()
    capability = major * 10 + minor
    if capability < 70:
        raise NotImplementedError(
            f"[Tubomind] Only capability >= 70 is supported for now, but got {capability}"
        )

    return True


def is_layer_skipped_awq(prefix: str, modules_to_not_convert: List[str]):
    return any(module_name in prefix for module_name in modules_to_not_convert)
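The helpers above implement the AWQ nibble unpack/repack step used before handing weights to the TurboMind kernel. The sketch below is a minimal, self-contained round-trip demo; it re-defines simplified local copies of get_u4_slices and pack_u4_row (the module itself is deleted by this commit), and the interleaved ordering of unpack_awq_gemm is omitted for brevity.

import torch

# Simplified local copies of the deleted helpers, for illustration only.
def get_u4_slices(x: torch.Tensor, dtype: torch.dtype):
    assert x.dtype == torch.int32
    xs = []
    for _ in range(8):
        xs.append((x & 15).to(dtype))  # take the lowest nibble
        x = x >> 4
    return xs

def pack_u4_row(x: torch.Tensor) -> torch.Tensor:
    assert x.dtype == torch.uint8
    xs = x.view(*x.shape[:-1], -1, 8).split(1, dim=-1)
    a = torch.zeros(xs[0].shape, dtype=torch.int32, device=x.device)
    for t in reversed(xs):
        a = (a << 4) | t  # highest-index nibble ends up in the top bits
    return a.squeeze(dim=-1)

# Pack the nibbles 0..7 into one int32 word, then round-trip them.
packed = torch.zeros(1, 1, dtype=torch.int32)
for nibble in reversed(range(8)):
    packed = (packed << 4) | nibble
slices = get_u4_slices(packed, torch.uint8)          # eight (1, 1) tensors: 0, 1, ..., 7
repacked = pack_u4_row(torch.cat(slices, dim=-1))    # (1, 1) int32, equal to `packed`
assert torch.equal(repacked, packed)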
python/sglang/srt/server_args.py

@@ -375,7 +375,6 @@ class ServerArgs:
            "marlin",
            "gptq_marlin",
            "awq_marlin",
-           "awq_turbomind",
            "bitsandbytes",
            "gguf",
            "modelopt",
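The effect of this hunk is that "awq_turbomind" is no longer an accepted value for --quantization. A minimal argparse sketch of that validation follows; the choices list here is abbreviated and assumed, not the full ServerArgs definition.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--quantization",
    type=str,
    default=None,
    # Abbreviated choices; "awq_turbomind" was removed from this list by the revert.
    choices=["awq", "gptq", "marlin", "gptq_marlin", "awq_marlin",
             "bitsandbytes", "gguf", "modelopt"],
)

print(parser.parse_args(["--quantization", "awq_marlin"]))
# parser.parse_args(["--quantization", "awq_turbomind"]) now fails with
# "invalid choice" instead of selecting the TurboMind path.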
test/srt/test_turbomind_awq.py (deleted, mode 100644 → 0)

import unittest
from types import SimpleNamespace

from sglang.srt.utils import kill_process_tree
from sglang.test.run_eval import run_eval
from sglang.test.test_utils import (
    DEFAULT_MODEL_NAME_FOR_TEST,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    popen_launch_server,
)


class TestMLA(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=[
                "--quantization",
                "awq_turbomind",
            ],
        )

    @classmethod
    def tearDownClass(cls):
        kill_process_tree(cls.process.pid)

    def test_mgsm_en(self):
        args = SimpleNamespace(
            base_url=self.base_url,
            model=self.model,
            eval_name="mgsm_en",
            num_examples=None,
            num_threads=1024,
        )
        metrics = run_eval(args)
        assert metrics["score"] >= 0.5


if __name__ == "__main__":
    unittest.main()