Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
6781a21e
Commit
6781a21e
authored
Jul 03, 2025
by
zhuwenwen
Browse files
Merge remote-tracking branch 'origin/0.9.1-dev-w8a8' into v0.9.1-dev
parents
f9795c8c
c0c0eb69
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
111 additions
and
42 deletions
+111
-42
vllm/model_executor/layers/quantization/blockwise_int8.py
vllm/model_executor/layers/quantization/blockwise_int8.py
+23
-6
vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
...ers/quantization/compressed_tensors/compressed_tensors.py
+3
-3
vllm/model_executor/models/deepseek_v2.py
vllm/model_executor/models/deepseek_v2.py
+4
-0
vllm/utils.py
vllm/utils.py
+81
-33
No files found.
vllm/model_executor/layers/quantization/blockwise_int8.py
View file @
6781a21e
...
@@ -231,8 +231,8 @@ class BlockInt8LinearMethod(LinearMethodBase):
...
@@ -231,8 +231,8 @@ class BlockInt8LinearMethod(LinearMethodBase):
n
=
layer
.
weight
.
shape
[
0
]
n
=
layer
.
weight
.
shape
[
0
]
k
=
layer
.
weight
.
shape
[
1
]
k
=
layer
.
weight
.
shape
[
1
]
if
{
n
,
k
}
not
in
self
.
tritonsingleton
.
weight_shapes
:
if
[
n
,
k
]
not
in
self
.
tritonsingleton
.
weight_shapes
:
self
.
tritonsingleton
.
weight_shapes
.
append
(
{
n
,
k
}
)
self
.
tritonsingleton
.
weight_shapes
.
append
(
[
n
,
k
]
)
json_file
=
self
.
tritonsingleton
.
get_blockint8json_name
(
n
,
k
,
self
.
block_size
[
0
],
self
.
block_size
[
1
])
json_file
=
self
.
tritonsingleton
.
get_blockint8json_name
(
n
,
k
,
self
.
block_size
[
0
],
self
.
block_size
[
1
])
configs_dict
=
self
.
tritonsingleton
.
get_blockint8_triton_cache
(
json_file
,
n
,
k
,
self
.
block_size
[
0
],
self
.
block_size
[
1
])
configs_dict
=
self
.
tritonsingleton
.
get_blockint8_triton_cache
(
json_file
,
n
,
k
,
self
.
block_size
[
0
],
self
.
block_size
[
1
])
...
@@ -260,7 +260,6 @@ class BlockInt8LinearMethod(LinearMethodBase):
...
@@ -260,7 +260,6 @@ class BlockInt8LinearMethod(LinearMethodBase):
K
=
x
.
shape
[
1
]
K
=
x
.
shape
[
1
]
N
=
layer
.
weight
.
shape
[
0
]
N
=
layer
.
weight
.
shape
[
0
]
#print("self.tritonsingleton.triton_json_dict:",self.tritonsingleton.triton_json_dict)
#Get the best config options
#Get the best config options
if
len
(
self
.
tritonsingleton
.
triton_json_dict
)
==
0
:
if
len
(
self
.
tritonsingleton
.
triton_json_dict
)
==
0
:
config
=
None
config
=
None
...
@@ -292,9 +291,7 @@ class BlockInt8LinearMethod(LinearMethodBase):
...
@@ -292,9 +291,7 @@ class BlockInt8LinearMethod(LinearMethodBase):
else
:
else
:
config
=
None
config
=
None
#print("m:{},n:{},k:{},config:{}".format(M,N,K,config))
return
apply_w8a8_block_int8_linear
(
return
apply_w8a8_block_int8_linear
(
input
=
x
,
input
=
x
,
weight
=
layer
.
weight
,
weight
=
layer
.
weight
,
...
@@ -431,6 +428,26 @@ class BlockInt8MoEMethod:
...
@@ -431,6 +428,26 @@ class BlockInt8MoEMethod:
def
process_weights_after_loading
(
self
,
layer
:
Module
)
->
None
:
def
process_weights_after_loading
(
self
,
layer
:
Module
)
->
None
:
# Block quant doesn't need to process weights after loading
# Block quant doesn't need to process weights after loading
# warmup and get moe block-int8 config
# warmup and get moe block-int8 config
E
=
layer
.
w13_weight
.
shape
[
0
]
N1
=
layer
.
w13_weight
.
shape
[
1
]
N2
=
layer
.
w2_weight
.
shape
[
1
]
K
=
layer
.
w2_weight
.
shape
[
2
]
if
[
E
,
N1
,
N2
,
K
]
not
in
self
.
tritonsingleton
.
moe_weight_shapes
:
self
.
tritonsingleton
.
moe_weight_shapes
.
append
([
E
,
N1
,
N2
,
K
])
TOPK
=
self
.
tritonsingleton
.
topk
block_size
=
self
.
quant_config
.
weight_block_size
json_file
=
self
.
tritonsingleton
.
get_moeblockint8json_name
(
block_size
,
E
,
N1
,
N2
,
K
,
TOPK
)
configs_dict
=
self
.
tritonsingleton
.
get_moeblockint8_triton_cache
(
json_file
,
block_size
,
E
,
N1
,
N2
,
K
,
TOPK
)
#warmup
if
configs_dict
:
self
.
tritonsingleton
.
triton_moejson_dict
.
update
(
configs_dict
)
#生成模型配置文件
self
.
tritonsingleton
.
gen_model_json
(
block_size
)
return
return
def
apply
(
def
apply
(
...
...
vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
View file @
6781a21e
...
@@ -597,8 +597,8 @@ class CompressedTensorsLinearMethod(LinearMethodBase):
...
@@ -597,8 +597,8 @@ class CompressedTensorsLinearMethod(LinearMethodBase):
k
=
layer
.
weight
.
shape
[
1
]
k
=
layer
.
weight
.
shape
[
1
]
if
self
.
w8a8_strategy
==
1
:
if
self
.
w8a8_strategy
==
1
:
if
{
n
,
k
}
not
in
self
.
tritonsingleton
.
weight_shapes
:
if
[
n
,
k
]
not
in
self
.
tritonsingleton
.
weight_shapes
:
self
.
tritonsingleton
.
weight_shapes
.
append
(
{
n
,
k
}
)
self
.
tritonsingleton
.
weight_shapes
.
append
(
[
n
,
k
]
)
json_file
=
self
.
tritonsingleton
.
get_w8a8json_name
(
n
,
k
)
json_file
=
self
.
tritonsingleton
.
get_w8a8json_name
(
n
,
k
)
configs_dict
=
self
.
tritonsingleton
.
get_triton_cache
(
json_file
,
n
,
k
)
configs_dict
=
self
.
tritonsingleton
.
get_triton_cache
(
json_file
,
n
,
k
)
...
@@ -607,7 +607,7 @@ class CompressedTensorsLinearMethod(LinearMethodBase):
...
@@ -607,7 +607,7 @@ class CompressedTensorsLinearMethod(LinearMethodBase):
for
key
,
value
in
configs_dict
.
items
():
for
key
,
value
in
configs_dict
.
items
():
m
=
int
(
key
.
split
(
'_'
)[
0
])
m
=
int
(
key
.
split
(
'_'
)[
0
])
ops
.
triton_int8_gemm_helper
(
m
=
m
,
n
=
n
,
k
=
k
,
per_token_act_quant
=
True
,
per_out_channel_weight_quant
=
True
,
use_bias
=
False
,
best_config
=
value
)
ops
.
triton_int8_gemm_helper
(
m
=
m
,
n
=
n
,
k
=
k
,
per_token_act_quant
=
True
,
per_out_channel_weight_quant
=
True
,
use_bias
=
False
,
device
=
layer
.
weight
.
device
,
best_config
=
value
)
else
:
else
:
weight_data
=
layer
.
weight
.
data
weight_data
=
layer
.
weight
.
data
_weight
=
weight_data
.
T
.
contiguous
().
reshape
(
n
,
-
1
)
_weight
=
weight_data
.
T
.
contiguous
().
reshape
(
n
,
-
1
)
...
...
vllm/model_executor/models/deepseek_v2.py
View file @
6781a21e
...
@@ -60,6 +60,7 @@ from .utils import (PPMissingLayer, is_pp_missing_parameter,
...
@@ -60,6 +60,7 @@ from .utils import (PPMissingLayer, is_pp_missing_parameter,
make_empty_intermediate_tensors_factory
,
make_layers
,
make_empty_intermediate_tensors_factory
,
make_layers
,
maybe_prefix
)
maybe_prefix
)
from
vllm
import
_custom_ops
as
ops
from
vllm
import
_custom_ops
as
ops
from
vllm.utils
import
W8a8GetCacheJSON
class
DeepseekV2MLP
(
nn
.
Module
):
class
DeepseekV2MLP
(
nn
.
Module
):
...
@@ -727,6 +728,9 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP):
...
@@ -727,6 +728,9 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP):
self
.
model
.
make_empty_intermediate_tensors
)
self
.
model
.
make_empty_intermediate_tensors
)
self
.
use_llama_nn
=
os
.
environ
.
get
(
'LLAMA_NN'
)
==
'1'
self
.
use_llama_nn
=
os
.
environ
.
get
(
'LLAMA_NN'
)
==
'1'
self
.
use_awq_pad
=
os
.
environ
.
get
(
'AWQ_PAD'
)
==
'1'
self
.
use_awq_pad
=
os
.
environ
.
get
(
'AWQ_PAD'
)
==
'1'
self
.
tritonsingleton
=
W8a8GetCacheJSON
()
self
.
tritonsingleton
.
topk
=
config
.
num_experts_per_tok
self
.
tritonsingleton
.
quant_method
=
self
.
quant_method
def
get_input_embeddings
(
self
,
input_ids
:
torch
.
Tensor
)
->
torch
.
Tensor
:
def
get_input_embeddings
(
self
,
input_ids
:
torch
.
Tensor
)
->
torch
.
Tensor
:
return
self
.
model
.
get_input_embeddings
(
input_ids
)
return
self
.
model
.
get_input_embeddings
(
input_ids
)
...
...
vllm/utils.py
View file @
6781a21e
...
@@ -1872,7 +1872,6 @@ class AtomicCounter:
...
@@ -1872,7 +1872,6 @@ class AtomicCounter:
def
value
(
self
):
def
value
(
self
):
return
self
.
_value
return
self
.
_value
class
W8a8GetCacheJSON
:
class
W8a8GetCacheJSON
:
_instance
=
None
_instance
=
None
...
@@ -1883,14 +1882,69 @@ class W8a8GetCacheJSON:
...
@@ -1883,14 +1882,69 @@ class W8a8GetCacheJSON:
return
cls
.
_instance
return
cls
.
_instance
def
_initialize
(
self
):
def
_initialize
(
self
):
from
vllm.platforms
import
current_platform
current_folder_path
=
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
))
current_folder_path
=
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
))
json_folder_path
=
current_folder_path
+
'/../lmslim/configs/w8a8'
json_folder_path
=
current_folder_path
+
'/../lmslim/configs/w8a8'
self
.
triton_json_dir
=
(
os
.
getenv
(
'TRITON_JSON_DIR'
,
json_folder_path
))
self
.
triton_json_dir
=
(
os
.
getenv
(
'TRITON_JSON_DIR'
,
json_folder_path
))
self
.
triton_json_dict
=
{}
self
.
triton_json_dict
=
{}
self
.
triton_moejson_dict
=
{}
self
.
triton_json_list
=
[]
self
.
triton_json_list
=
[]
self
.
weight_shapes
=
[]
self
.
weight_shapes
=
[]
self
.
moe_weight_shapes
=
[]
device_name
=
current_platform
.
get_device_name
().
replace
(
" "
,
"_"
)
if
'K100_AI'
in
device_name
and
torch
.
cuda
.
get_device_properties
(
torch
.
cuda
.
current_device
()).
multi_processor_count
==
120
:
device_name
=
'K100_AI_120'
self
.
device_name
=
device_name
self
.
topk
=
1
self
.
quant_method
=
None
# Writes the model.json tuning description (recorded layer shapes plus
# quantization settings) into the directory named by LMSLIM_TUNING_JSON.
def gen_model_json(self, E: int, block_size: Optional[list] = None):
    """Dump the recorded linear/MoE weight shapes to ``model.json``.

    The file is written to ``$LMSLIM_TUNING_JSON/model.json`` and is only
    produced when that environment variable names an existing path;
    otherwise an info message is logged and nothing is written.

    Reads ``self.topk``, ``self.quant_method``, ``self.moe_weight_shapes``
    and ``self.weight_shapes``.

    Args:
        E: Not used when building the file. The call site visible in this
            change passes the block size as the single positional
            argument, which binds here; a list/tuple received in this slot
            is therefore treated as ``block_size`` when ``block_size``
            itself is omitted.
        block_size: Weight block size stored under
            ``quantization_config.weight_block_size``. When it stays
            ``None`` the field keeps the placeholder string ``"None"``.
    """
    # Backward-compatible repair for callers doing gen_model_json(block_size):
    # without this the block size would be silently dropped.
    if block_size is None and isinstance(E, (list, tuple)):
        block_size = E

    json_dir = os.getenv('LMSLIM_TUNING_JSON', "None")
    # Compare by value: identity (`is not`) against a string literal only
    # works by accident of object reuse and raises a SyntaxWarning.
    if json_dir != "None" and os.path.exists(json_dir):
        # Generate the model configuration file.
        logger.info("model_tuning.json is at LMSLIM_TUNING_JSON:%s", json_dir)
        config = {
            "layers": {
                "linear": {
                    "shapes": [],
                    "m_range": "None",
                },
                "moe": {
                    "shapes": [],
                    "m_range": "None",
                    "topk": self.topk,
                },
            },
            "quantization_config": {
                "quant_method": self.quant_method,
                "weight_block_size": "None",
            },
        }

        # MoE shapes are recorded as [E, N1, N2, K]; entries of any other
        # length are skipped.
        for shape in self.moe_weight_shapes:
            if len(shape) == 4:
                config["layers"]["moe"]["shapes"].append({
                    "E": shape[0],
                    "N1": shape[1],
                    "N2": shape[2],
                    "K": shape[3],
                })

        # Linear shapes are copied through unchanged.
        config["layers"]["linear"]["shapes"].extend(self.weight_shapes)

        if block_size is not None:
            config["quantization_config"]["weight_block_size"] = block_size

        with open(os.path.join(json_dir, "model.json"), 'w',
                  encoding="utf-8") as f:
            json.dump(config, f, indent=4)
    else:
        # Also reached when the variable is set but the path does not exist.
        logger.info("LMSLIM_TUNING_JSON is not set")
def
getspec_config
(
self
,
configs_dict
,
M
,
N
,
K
):
def
getspec_config
(
self
,
configs_dict
,
M
,
N
,
K
):
if
f
"
{
M
}
_
{
N
}
_
{
K
}
"
in
configs_dict
:
if
f
"
{
M
}
_
{
N
}
_
{
K
}
"
in
configs_dict
:
return
configs_dict
[
f
"
{
M
}
_
{
N
}
_
{
K
}
"
]
return
configs_dict
[
f
"
{
M
}
_
{
N
}
_
{
K
}
"
]
...
@@ -1913,24 +1967,11 @@ class W8a8GetCacheJSON:
...
@@ -1913,24 +1967,11 @@ class W8a8GetCacheJSON:
for
key
,
value
in
cachedata
.
items
():
for
key
,
value
in
cachedata
.
items
():
for
sub_key
,
sub_value
in
value
.
items
():
for
sub_key
,
sub_value
in
value
.
items
():
configs_key
=
f
"
{
sub_key
}
_
{
key
}
"
configs_key
=
f
"
{
sub_key
}
_
{
key
}
"
configs_value
=
{
configs_dict
[
configs_key
]
=
sub_value
'SPLIT_K'
:
int
(
sub_value
[
"SPLIT_K"
]),
'BLOCK_SIZE_M'
:
int
(
sub_value
[
"BLOCK_SIZE_M"
]),
'BLOCK_SIZE_N'
:
int
(
sub_value
[
"BLOCK_SIZE_N"
]),
'BLOCK_SIZE_K'
:
int
(
sub_value
[
"BLOCK_SIZE_K"
]),
'GROUP_SIZE_M'
:
int
(
sub_value
[
"GROUP_SIZE_M"
]),
'num_stages'
:
int
(
sub_value
[
'num_stages'
]),
'num_warps'
:
int
(
sub_value
[
'num_warps'
])
}
configs_dict
[
configs_key
]
=
configs_value
return
configs_dict
return
configs_dict
def
get_w8a8json_name
(
self
,
n
,
k
):
def
get_w8a8json_name
(
self
,
n
,
k
):
from
vllm.platforms
import
current_platform
return
self
.
triton_json_dir
+
f
"/W8A8_
{
n
}
_
{
k
}
_
{
self
.
device_name
}
.json"
device_name
=
current_platform
.
get_device_name
().
replace
(
" "
,
"_"
)
if
'K100_AI'
in
device_name
and
torch
.
cuda
.
get_device_properties
(
torch
.
cuda
.
current_device
()).
multi_processor_count
==
120
:
device_name
=
'K100_AI_120'
return
self
.
triton_json_dir
+
f
"/W8A8_
{
n
}
_
{
k
}
_
{
device_name
}
.json"
def
get_blockint8_triton_cache
(
self
,
file_path
,
n
,
k
,
block_n
,
block_k
):
def
get_blockint8_triton_cache
(
self
,
file_path
,
n
,
k
,
block_n
,
block_k
):
cache_json_file
=
file_path
cache_json_file
=
file_path
...
@@ -1947,27 +1988,34 @@ class W8a8GetCacheJSON:
...
@@ -1947,27 +1988,34 @@ class W8a8GetCacheJSON:
for
key
,
value
in
cachedata
.
items
():
for
key
,
value
in
cachedata
.
items
():
for
sub_key
,
sub_value
in
value
.
items
():
for
sub_key
,
sub_value
in
value
.
items
():
configs_key
=
f
"
{
sub_key
}
_
{
key
}
"
configs_key
=
f
"
{
sub_key
}
_
{
key
}
"
configs_value
=
{
configs_dict
[
configs_key
]
=
sub_value
'BLOCK_SIZE_M'
:
int
(
sub_value
[
"BLOCK_SIZE_M"
]),
'BLOCK_SIZE_N'
:
int
(
sub_value
[
"BLOCK_SIZE_N"
]),
'BLOCK_SIZE_K'
:
int
(
sub_value
[
"BLOCK_SIZE_K"
]),
'GROUP_SIZE_M'
:
int
(
sub_value
[
"GROUP_SIZE_M"
]),
'kpack'
:
int
(
sub_value
[
"kpack"
]),
'num_stages'
:
int
(
sub_value
[
'num_stages'
]),
'num_warps'
:
int
(
sub_value
[
'num_warps'
]),
'enable_mmacfuse'
:
int
(
sub_value
[
'enable_mmacfuse'
]),
}
configs_dict
[
configs_key
]
=
configs_value
return
configs_dict
return
configs_dict
def
get_blockint8json_name
(
self
,
n
,
k
,
block_n
,
block_k
):
def
get_blockint8json_name
(
self
,
n
,
k
,
block_n
,
block_k
):
from
vllm.platforms
import
current_platform
return
self
.
triton_json_dir
+
f
"/linear_
{
n
}
_
{
k
}
_block[
{
block_n
}
,
{
block_k
}
]_
{
self
.
device_name
}
.json"
device_name
=
current_platform
.
get_device_name
().
replace
(
" "
,
"_"
)
if
'K100_AI'
in
device_name
and
torch
.
cuda
.
get_device_properties
(
torch
.
cuda
.
current_device
()).
multi_processor_count
==
120
:
device_name
=
'K100_AI_120'
return
self
.
triton_json_dir
+
f
"/linear_
{
n
}
_
{
k
}
_block[
{
block_n
}
,
{
block_k
}
]_
{
device_name
}
.json"
# Build the path of the MoE block-int8 tuning JSON file: the file lives under
# self.triton_json_dir and its name encodes block_size[0], block_size[1],
# E, N1, N2, K, TOPK and self.device_name. Nothing is opened or checked here;
# the method only formats and returns the path string.
def
get_moeblockint8json_name
(
self
,
block_size
,
E
,
N1
,
N2
,
K
,
TOPK
):
return
self
.
triton_json_dir
+
f
"/MOE_BLOCKINT8[
{
block_size
[
0
]
}
,
{
block_size
[
1
]
}
]_E=
{
E
}
_N1=
{
N1
}
_N2=
{
N2
}
_K=
{
K
}
_TOPK
{
TOPK
}
_
{
self
.
device_name
}
.json"
def get_moeblockint8_triton_cache(self, file_path, block_size, E, N1, N2, K, TOPK):
    """Load a MoE block-int8 tuning JSON file and flatten it into one dict.

    The file is expected to hold a two-level mapping
    ``{outer_key: {inner_key: config}}``. Each entry is re-keyed as
    ``"<inner_key>_<outer_key>"`` and mapped to its config value unchanged.

    Args:
        file_path: Path of the JSON cache file to read.
        block_size, E, N1, N2, K, TOPK: Accepted for signature parity with
            the matching name builder; not used when parsing the file.

    Returns:
        The flattened ``{key: config}`` dict, or ``None`` when ``file_path``
        does not exist.
    """
    # Open directly instead of checking os.path.exists first, so a file that
    # disappears between the check and the open cannot raise here.
    try:
        with open(file_path, 'r') as file:
            cachedata = json.load(file)
    except FileNotFoundError:
        return None

    # Flatten every cache entry into key -> config form.
    return {
        f"{sub_key}_{key}": sub_value
        for key, value in cachedata.items()
        for sub_key, sub_value in value.items()
    }
# Adapted from: https://stackoverflow.com/a/47212782/5082708
# Adapted from: https://stackoverflow.com/a/47212782/5082708
class
LazyDict
(
Mapping
[
str
,
T
],
Generic
[
T
]):
class
LazyDict
(
Mapping
[
str
,
T
],
Generic
[
T
]):
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment