Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
66b809cc
Commit
66b809cc
authored
Feb 08, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.7.2' into v0.7.2-dev
parents
37b63c24
0408efc6
Changes
1000
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
71 additions
and
32 deletions
+71
-32
vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py
...xecutor/layers/quantization/kernels/scaled_mm/__init__.py
+2
-0
vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py
...executor/layers/quantization/kernels/scaled_mm/cutlass.py
+2
-0
vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py
..._executor/layers/quantization/kernels/scaled_mm/triton.py
+2
-0
vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py
...del_executor/layers/quantization/kernels/scaled_mm/xla.py
+2
-0
vllm/model_executor/layers/quantization/kv_cache.py
vllm/model_executor/layers/quantization/kv_cache.py
+2
-0
vllm/model_executor/layers/quantization/marlin.py
vllm/model_executor/layers/quantization/marlin.py
+2
-0
vllm/model_executor/layers/quantization/modelopt.py
vllm/model_executor/layers/quantization/modelopt.py
+2
-0
vllm/model_executor/layers/quantization/moe_wna16.py
vllm/model_executor/layers/quantization/moe_wna16.py
+20
-19
vllm/model_executor/layers/quantization/neuron_quant.py
vllm/model_executor/layers/quantization/neuron_quant.py
+2
-0
vllm/model_executor/layers/quantization/qqq.py
vllm/model_executor/layers/quantization/qqq.py
+2
-0
vllm/model_executor/layers/quantization/quark/quark.py
vllm/model_executor/layers/quantization/quark/quark.py
+7
-5
vllm/model_executor/layers/quantization/quark/quark_moe.py
vllm/model_executor/layers/quantization/quark/quark_moe.py
+2
-0
vllm/model_executor/layers/quantization/quark/schemes/__init__.py
...el_executor/layers/quantization/quark/schemes/__init__.py
+2
-0
vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py
...xecutor/layers/quantization/quark/schemes/quark_scheme.py
+2
-0
vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py
...cutor/layers/quantization/quark/schemes/quark_w8a8_fp8.py
+2
-0
vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py
...utor/layers/quantization/quark/schemes/quark_w8a8_int8.py
+2
-0
vllm/model_executor/layers/quantization/quark/utils.py
vllm/model_executor/layers/quantization/quark/utils.py
+11
-8
vllm/model_executor/layers/quantization/schema.py
vllm/model_executor/layers/quantization/schema.py
+1
-0
vllm/model_executor/layers/quantization/tpu_int8.py
vllm/model_executor/layers/quantization/tpu_int8.py
+2
-0
vllm/model_executor/layers/quantization/utils/__init__.py
vllm/model_executor/layers/quantization/utils/__init__.py
+2
-0
No files found.
Too many changes to show.
To preserve performance only
1000 of 1000+
files are displayed.
Plain diff
Email patch
vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
os
import
os
from
typing
import
Dict
,
List
,
Optional
,
Type
from
typing
import
Dict
,
List
,
Optional
,
Type
...
...
vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
typing
import
Optional
,
Tuple
from
typing
import
Optional
,
Tuple
import
torch
import
torch
...
...
vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
typing
import
Optional
,
Tuple
from
typing
import
Optional
,
Tuple
import
torch
import
torch
...
...
vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
warnings
import
warnings
from
typing
import
Optional
,
Tuple
from
typing
import
Optional
,
Tuple
...
...
vllm/model_executor/layers/quantization/kv_cache.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
torch
import
torch
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
...
...
vllm/model_executor/layers/quantization/marlin.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
typing
import
Any
,
Dict
,
List
,
Optional
from
typing
import
Any
,
Dict
,
List
,
Optional
import
torch
import
torch
...
...
vllm/model_executor/layers/quantization/modelopt.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
typing
import
Any
,
Dict
,
List
,
Optional
from
typing
import
Any
,
Dict
,
List
,
Optional
import
torch
import
torch
...
...
vllm/model_executor/layers/quantization/moe_wna16.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
typing
import
Any
,
Callable
,
Dict
,
List
,
Optional
from
typing
import
Any
,
Callable
,
Dict
,
List
,
Optional
import
torch
import
torch
...
@@ -5,17 +7,15 @@ import torch
...
@@ -5,17 +7,15 @@ import torch
from
vllm.distributed
import
get_tensor_model_parallel_rank
,
get_tp_group
from
vllm.distributed
import
get_tensor_model_parallel_rank
,
get_tp_group
from
vllm.model_executor.layers.fused_moe.layer
import
(
from
vllm.model_executor.layers.fused_moe.layer
import
(
FusedMoE
,
FusedMoEMethodBase
,
FusedMoeWeightScaleSupported
)
FusedMoE
,
FusedMoEMethodBase
,
FusedMoeWeightScaleSupported
)
from
vllm.model_executor.layers.linear
import
UnquantizedLinearMethod
from
vllm.model_executor.layers.linear
import
(
LinearBase
,
from
vllm.model_executor.layers.quantization.awq
import
(
AWQConfig
,
UnquantizedLinearMethod
)
AWQLinearMethod
)
from
vllm.model_executor.layers.quantization.awq
import
AWQConfig
from
vllm.model_executor.layers.quantization.awq_marlin
import
(
from
vllm.model_executor.layers.quantization.awq_marlin
import
AWQMarlinConfig
AWQMarlinConfig
,
AWQMarlinLinearMethod
)
from
vllm.model_executor.layers.quantization.base_config
import
(
from
vllm.model_executor.layers.quantization.base_config
import
(
QuantizationConfig
,
QuantizeMethodBase
)
QuantizationConfig
,
QuantizeMethodBase
)
from
vllm.model_executor.layers.quantization.gptq
import
(
GPTQConfig
,
from
vllm.model_executor.layers.quantization.gptq
import
GPTQConfig
GPTQLinearMethod
)
from
vllm.model_executor.layers.quantization.gptq_marlin
import
(
from
vllm.model_executor.layers.quantization.gptq_marlin
import
(
GPTQMarlinConfig
,
GPTQMarlinLinearMethod
)
GPTQMarlinConfig
)
from
vllm.model_executor.utils
import
set_weight_attrs
from
vllm.model_executor.utils
import
set_weight_attrs
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
...
@@ -126,25 +126,26 @@ class MoeWNA16Config(QuantizationConfig):
...
@@ -126,25 +126,26 @@ class MoeWNA16Config(QuantizationConfig):
prefix
:
str
)
->
Optional
[
"QuantizeMethodBase"
]:
prefix
:
str
)
->
Optional
[
"QuantizeMethodBase"
]:
if
is_layer_skipped_quant
(
prefix
,
self
.
modules_to_not_convert
):
if
is_layer_skipped_quant
(
prefix
,
self
.
modules_to_not_convert
):
return
UnquantizedLinearMethod
()
return
UnquantizedLinearMethod
()
elif
isinstance
(
layer
,
FusedMoE
):
elif
isinstance
(
layer
,
LinearBase
):
return
MoeWNA16Method
(
self
)
else
:
if
self
.
linear_quant_method
==
"gptq"
:
if
self
.
linear_quant_method
==
"gptq"
:
if
self
.
use_marlin
:
if
self
.
use_marlin
:
return
GPTQMarlin
LinearMethod
(
return
GPTQMarlin
Config
.
from_config
(
GPTQMarlinConfig
.
from_config
(
self
.
full_config
)
)
self
.
full_config
).
get_quant_method
(
layer
,
prefix
)
else
:
else
:
return
GPTQ
LinearMethod
(
return
GPTQ
Config
.
from_config
(
GPTQConfig
.
from_config
(
self
.
full_config
)
)
self
.
full_config
).
get_quant_method
(
layer
,
prefix
)
elif
self
.
linear_quant_method
==
"awq"
:
elif
self
.
linear_quant_method
==
"awq"
:
if
self
.
use_marlin
:
if
self
.
use_marlin
:
return
AWQMarlin
LinearMethod
(
return
AWQMarlin
Config
.
from_config
(
AWQMarlinConfig
.
from_config
(
self
.
full_config
)
)
self
.
full_config
).
get_quant_method
(
layer
,
prefix
)
else
:
else
:
return
AWQ
LinearMethod
(
return
AWQ
Config
.
from_config
(
AWQConfig
.
from_config
(
self
.
full_config
)
)
self
.
full_config
).
get_quant_method
(
layer
,
prefix
)
else
:
else
:
raise
ValueError
(
"moe_wna16 only support gptq and awq."
)
raise
ValueError
(
"moe_wna16 only support gptq and awq."
)
elif
isinstance
(
layer
,
FusedMoE
):
return
MoeWNA16Method
(
self
)
return
None
def
is_layer_skipped_quant
(
prefix
:
str
,
modules_to_not_convert
:
List
[
str
]):
def
is_layer_skipped_quant
(
prefix
:
str
,
modules_to_not_convert
:
List
[
str
]):
...
...
vllm/model_executor/layers/quantization/neuron_quant.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
os
import
os
from
importlib.util
import
find_spec
from
importlib.util
import
find_spec
from
typing
import
Any
,
Dict
,
List
,
Optional
from
typing
import
Any
,
Dict
,
List
,
Optional
...
...
vllm/model_executor/layers/quantization/qqq.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
typing
import
Any
,
Dict
,
List
,
Optional
from
typing
import
Any
,
Dict
,
List
,
Optional
import
torch
import
torch
...
...
vllm/model_executor/layers/quantization/quark/quark.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
fnmatch
import
fnmatch
import
re
import
re
from
typing
import
Any
,
Dict
,
List
,
Optional
,
cast
from
typing
import
Any
,
Dict
,
List
,
Optional
,
cast
...
@@ -16,8 +18,6 @@ from vllm.model_executor.layers.quantization.quark.schemes import (
...
@@ -16,8 +18,6 @@ from vllm.model_executor.layers.quantization.quark.schemes import (
QuarkScheme
,
QuarkW8A8Fp8
,
QuarkW8A8Int8
)
QuarkScheme
,
QuarkW8A8Fp8
,
QuarkW8A8Int8
)
from
vllm.model_executor.layers.quantization.quark.utils
import
(
from
vllm.model_executor.layers.quantization.quark.utils
import
(
deep_compare
,
should_ignore_layer
)
deep_compare
,
should_ignore_layer
)
from
vllm.model_executor.layers.quantization.utils.quant_utils
import
(
FUSED_LAYER_NAME_MAPPING
)
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
__all__
=
[
"QuarkLinearMethod"
]
__all__
=
[
"QuarkLinearMethod"
]
...
@@ -56,7 +56,9 @@ class QuarkConfig(QuantizationConfig):
...
@@ -56,7 +56,9 @@ class QuarkConfig(QuantizationConfig):
# Check if the layer is skipped for quantization.
# Check if the layer is skipped for quantization.
exclude_layers
=
cast
(
List
[
str
],
self
.
quant_config
.
get
(
"exclude"
))
exclude_layers
=
cast
(
List
[
str
],
self
.
quant_config
.
get
(
"exclude"
))
if
should_ignore_layer
(
prefix
,
ignore
=
exclude_layers
):
if
should_ignore_layer
(
prefix
,
ignore
=
exclude_layers
,
fused_mapping
=
self
.
packed_modules_mapping
):
return
UnquantizedLinearMethod
()
return
UnquantizedLinearMethod
()
if
isinstance
(
layer
,
LinearBase
):
if
isinstance
(
layer
,
LinearBase
):
scheme
=
self
.
get_scheme
(
layer
=
layer
,
layer_name
=
prefix
)
scheme
=
self
.
get_scheme
(
layer
=
layer
,
layer_name
=
prefix
)
...
@@ -199,8 +201,8 @@ class QuarkConfig(QuantizationConfig):
...
@@ -199,8 +201,8 @@ class QuarkConfig(QuantizationConfig):
module
:
torch
.
nn
.
Module
)
->
Dict
[
str
,
Any
]:
module
:
torch
.
nn
.
Module
)
->
Dict
[
str
,
Any
]:
proj_name
=
layer_name
.
split
(
"."
)[
-
1
]
proj_name
=
layer_name
.
split
(
"."
)[
-
1
]
if
proj_name
in
FUSED_LAYER_NAME_MAPPING
:
if
proj_name
in
self
.
packed_modules_mapping
:
shard_proj_names
=
FUSED_LAYER_NAME_MAPPING
[
proj_name
]
shard_proj_names
=
self
.
packed_modules_mapping
[
proj_name
]
# Convert fused_name --> [shard_names]
# Convert fused_name --> [shard_names]
shard_names
=
[
shard_names
=
[
...
...
vllm/model_executor/layers/quantization/quark/quark_moe.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
typing
import
Any
,
Callable
,
Dict
,
Optional
from
typing
import
Any
,
Callable
,
Dict
,
Optional
import
torch
import
torch
...
...
vllm/model_executor/layers/quantization/quark/schemes/__init__.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
.quark_scheme
import
QuarkScheme
from
.quark_scheme
import
QuarkScheme
from
.quark_w8a8_fp8
import
QuarkW8A8Fp8
from
.quark_w8a8_fp8
import
QuarkW8A8Fp8
from
.quark_w8a8_int8
import
QuarkW8A8Int8
from
.quark_w8a8_int8
import
QuarkW8A8Int8
...
...
vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
abc
import
ABC
,
abstractmethod
from
abc
import
ABC
,
abstractmethod
from
typing
import
Optional
from
typing
import
Optional
...
...
vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
typing
import
Callable
,
List
,
Optional
from
typing
import
Callable
,
List
,
Optional
import
torch
import
torch
...
...
vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
typing
import
Callable
,
List
,
Optional
,
Set
from
typing
import
Callable
,
List
,
Optional
,
Set
import
torch
import
torch
...
...
vllm/model_executor/layers/quantization/quark/utils.py
View file @
66b809cc
import
re
# SPDX-License-Identifier: Apache-2.0
from
typing
import
Any
,
Iterable
,
Optional
from
vllm.model_executor.layers.quantization.utils.quant_utils
import
(
import
re
FUSED_LAYER_NAME_MAPPING
)
from
types
import
MappingProxyType
from
typing
import
Any
,
Iterable
,
List
,
Mapping
,
Optional
def
deep_compare
(
dict1
:
Any
,
dict2
:
Any
)
->
bool
:
def
deep_compare
(
dict1
:
Any
,
dict2
:
Any
)
->
bool
:
...
@@ -18,8 +18,11 @@ def deep_compare(dict1: Any, dict2: Any) -> bool:
...
@@ -18,8 +18,11 @@ def deep_compare(dict1: Any, dict2: Any) -> bool:
return
dict1
==
dict2
return
dict1
==
dict2
def
should_ignore_layer
(
layer_name
:
Optional
[
str
],
def
should_ignore_layer
(
ignore
:
Iterable
[
str
])
->
bool
:
layer_name
:
Optional
[
str
],
ignore
:
Iterable
[
str
],
fused_mapping
:
Mapping
[
str
,
List
[
str
]]
=
MappingProxyType
({})
)
->
bool
:
if
layer_name
is
None
:
if
layer_name
is
None
:
return
False
return
False
...
@@ -31,8 +34,8 @@ def should_ignore_layer(layer_name: Optional[str],
...
@@ -31,8 +34,8 @@ def should_ignore_layer(layer_name: Optional[str],
# in the safetensors checkpoint. So, we convert the name
# in the safetensors checkpoint. So, we convert the name
# from the fused version to unfused + check to make sure that
# from the fused version to unfused + check to make sure that
# each shard of the fused layer has the same scheme.
# each shard of the fused layer has the same scheme.
if
proj_name
in
FUSED_LAYER_NAME_MAPPING
:
if
proj_name
in
fused_mapping
:
shard_proj_names
=
FUSED_LAYER_NAME_MAPPING
[
proj_name
]
shard_proj_names
=
fused_mapping
[
proj_name
]
# Convert fused_name --> [shard_names]
# Convert fused_name --> [shard_names]
shard_names
=
[
shard_names
=
[
...
...
vllm/model_executor/layers/quantization/schema.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
"""
"""
This file contains the Pydantic schemas for various quantization-related
This file contains the Pydantic schemas for various quantization-related
parameters. When a relevant quantization technique is specified, these
parameters. When a relevant quantization technique is specified, these
...
...
vllm/model_executor/layers/quantization/tpu_int8.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
typing
import
Any
,
Dict
,
List
,
Optional
,
Tuple
from
typing
import
Any
,
Dict
,
List
,
Optional
,
Tuple
import
torch
import
torch
...
...
vllm/model_executor/layers/quantization/utils/__init__.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
.layer_utils
import
replace_parameter
,
update_tensor_inplace
from
.layer_utils
import
replace_parameter
,
update_tensor_inplace
__all__
=
[
'update_tensor_inplace'
,
'replace_parameter'
]
__all__
=
[
'update_tensor_inplace'
,
'replace_parameter'
]
Prev
1
…
34
35
36
37
38
39
40
41
42
…
50
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment