Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
7ef78993
Commit
7ef78993
authored
Dec 19, 2024
by
gaoqiong
Browse files
增加w8a8 triton warmup代码
parent
c56b26cd
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
1475 additions
and
5 deletions
+1475
-5
setup.py
setup.py
+1
-1
vllm/_custom_ops.py
vllm/_custom_ops.py
+14
-1
vllm/model_executor/layers/quantization/configs/w8a8/W8A8_12288_4096_K100_AI.json
...rs/quantization/configs/w8a8/W8A8_12288_4096_K100_AI.json
+364
-0
vllm/model_executor/layers/quantization/configs/w8a8/W8A8_22016_4096_K100_AI.json
...rs/quantization/configs/w8a8/W8A8_22016_4096_K100_AI.json
+364
-0
vllm/model_executor/layers/quantization/configs/w8a8/W8A8_4096_11008_K100_AI.json
...rs/quantization/configs/w8a8/W8A8_4096_11008_K100_AI.json
+364
-0
vllm/model_executor/layers/quantization/configs/w8a8/W8A8_4096_4096_K100_AI.json
...ers/quantization/configs/w8a8/W8A8_4096_4096_K100_AI.json
+364
-0
vllm/model_executor/models/llama.py
vllm/model_executor/models/llama.py
+1
-1
vllm/utils.py
vllm/utils.py
+3
-2
No files found.
setup.py
View file @
7ef78993
...
...
@@ -543,7 +543,7 @@ if _build_custom_ops():
ext_modules
.
append
(
CMakeExtension
(
name
=
"vllm._C"
))
package_data
=
{
"vllm"
:
[
"py.typed"
,
"model_executor/layers/fused_moe/configs/*.json"
,
"benchmarks/*.py"
]
"vllm"
:
[
"py.typed"
,
"model_executor/layers/fused_moe/configs/*.json"
,
"benchmarks/*.py"
,
"model_executor/layers/quantization/configs/*.json"
]
}
if
envs
.
VLLM_USE_PRECOMPILED
:
ext_modules
=
[]
...
...
vllm/_custom_ops.py
View file @
7ef78993
import
contextlib
import
functools
from
typing
import
List
,
Optional
,
Tuple
,
Union
from
typing
import
List
,
Optional
,
Tuple
,
Union
,
Type
import
torch
...
...
@@ -11,6 +11,7 @@ from vllm.platforms import current_platform
try
:
from
lmslim
import
quant_ops
from
lmslim
import
quant_tools
except
Exception
:
print
(
"INFO: Please install lmslim if you want to infer gptq or awq or w8a8 model.
\n
"
)
...
...
@@ -909,6 +910,18 @@ def triton_scaled_mm(a: torch.Tensor,
return
quant_ops
.
triton_scaled_mm
(
a
,
b
,
scale_a
,
scale_b
,
out_dtype
,
bias
)
def triton_int8_gemm_helper(m: int,
                            n: int,
                            k: int,
                            per_token_act_quant: bool,
                            per_out_channel_weight_quant: bool,
                            use_bias: bool,
                            out_dtype: Type[torch.dtype] = torch.float16,
                            device: str = "cuda",
                            best_config: Optional[list] = None):
    """Forward to ``lmslim.quant_tools.triton_int8_gemm_helper``.

    This is a thin pass-through: every argument is handed to the lmslim
    helper positionally, in the order declared here, and the helper's
    return value is returned unchanged.

    Args:
        m: Forwarded as-is (presumably the GEMM M dimension -- confirm
            against lmslim).
        n: Forwarded as-is (presumably the GEMM N dimension).
        k: Forwarded as-is (presumably the GEMM K dimension).
        per_token_act_quant: Forwarded as-is.
        per_out_channel_weight_quant: Forwarded as-is.
        use_bias: Forwarded as-is.
        out_dtype: Forwarded as-is; defaults to ``torch.float16``.
        device: Forwarded as-is; defaults to ``"cuda"``.
        best_config: Optional configuration forwarded as the last
            positional argument; ``None`` when the caller supplies none.

    Returns:
        Whatever ``quant_tools.triton_int8_gemm_helper`` returns.

    Note:
        ``quant_tools`` comes from the optional ``lmslim`` import at the top
        of this module; if that import failed, the name is unbound and this
        call raises ``NameError``.
    """
    # The last argument must be the ``best_config`` parameter. The previous
    # code passed the undefined name ``est_config``, which raised NameError
    # on every call and never delivered the caller's configuration.
    return quant_tools.triton_int8_gemm_helper(m, n, k, per_token_act_quant,
                                               per_out_channel_weight_quant,
                                               use_bias, out_dtype, device,
                                               best_config)
def
cutlass_scaled_mm_azp
(
a
:
torch
.
Tensor
,
b
:
torch
.
Tensor
,
scale_a
:
torch
.
Tensor
,
...
...
vllm/model_executor/layers/quantization/configs/w8a8/W8A8_12288_4096_K100_AI.json
0 → 100755
View file @
7ef78993
{
"12288_4096"
:
{
"1"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
},
"2"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
},
"3"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
},
"4"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
},
"5"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
},
"6"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
},
"7"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
},
"8"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
},
"9"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
},
"10"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
},
"11"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
},
"12"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
},
"13"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
},
"14"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
},
"15"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
},
"16"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
},
"17"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
},
"18"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
},
"19"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
},
"20"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
},
"21"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
},
"22"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
},
"23"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
},
"24"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
},
"25"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
},
"26"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
},
"27"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
},
"28"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
},
"29"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
},
"30"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
},
"31"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
},
"32"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
},
"64"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
2
},
"128"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
4
},
"256"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
4
},
"512"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
4
},
"1024"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
4
},
"2048"
:
{
"BLOCK_SIZE_M"
:
256
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
8
},
"4096"
:
{
"BLOCK_SIZE_M"
:
256
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
8
},
"8192"
:
{
"BLOCK_SIZE_M"
:
256
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
8
}
}
}
\ No newline at end of file
vllm/model_executor/layers/quantization/configs/w8a8/W8A8_22016_4096_K100_AI.json
0 → 100755
View file @
7ef78993
{
"22016_4096"
:
{
"1"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
2
,
"num_stages"
:
1
,
"num_warps"
:
2
},
"2"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
4
,
"num_stages"
:
0
,
"num_warps"
:
8
},
"3"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
4
,
"num_stages"
:
0
,
"num_warps"
:
8
},
"4"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
4
,
"num_stages"
:
0
,
"num_warps"
:
8
},
"5"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
},
"6"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
},
"7"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
},
"8"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
},
"9"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
},
"10"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
},
"11"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
},
"12"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
},
"13"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
},
"14"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
},
"15"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
},
"16"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
},
"17"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
8
},
"18"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
8
},
"19"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
8
},
"20"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
8
},
"21"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
8
},
"22"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
8
},
"23"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
8
},
"24"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
8
},
"25"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
8
},
"26"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
8
},
"27"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
8
},
"28"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
8
},
"29"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
8
},
"30"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
8
},
"31"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
8
},
"32"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
8
},
"64"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
4
},
"128"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
},
"256"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
4
},
"512"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
4
},
"1024"
:
{
"BLOCK_SIZE_M"
:
256
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
8
},
"2048"
:
{
"BLOCK_SIZE_M"
:
256
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
8
},
"4096"
:
{
"BLOCK_SIZE_M"
:
256
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
8
},
"8192"
:
{
"BLOCK_SIZE_M"
:
256
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
8
}
}
}
\ No newline at end of file
vllm/model_executor/layers/quantization/configs/w8a8/W8A8_4096_11008_K100_AI.json
0 → 100755
View file @
7ef78993
{
"4096_11008"
:
{
"1"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
2
},
"2"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
2
},
"3"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
2
},
"4"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
2
},
"5"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
2
},
"6"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
2
},
"7"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
2
},
"8"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
2
},
"9"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
2
},
"10"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
2
},
"11"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
2
},
"12"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
2
},
"13"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
2
},
"14"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
2
},
"15"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
2
},
"16"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
2
},
"17"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
4
},
"18"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
4
},
"19"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
4
},
"20"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
4
},
"21"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
4
},
"22"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
2
},
"23"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
2
},
"24"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
2
},
"25"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
2
},
"26"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
2
},
"27"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
2
},
"28"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
2
},
"29"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
2
},
"30"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
2
},
"31"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
2
},
"32"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
2
},
"64"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
2
},
"128"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
4
},
"256"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
8
},
"512"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
8
},
"1024"
:
{
"BLOCK_SIZE_M"
:
256
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
8
},
"2048"
:
{
"BLOCK_SIZE_M"
:
256
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
8
},
"4096"
:
{
"BLOCK_SIZE_M"
:
256
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
8
},
"8192"
:
{
"BLOCK_SIZE_M"
:
256
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
8
}
}
}
\ No newline at end of file
vllm/model_executor/layers/quantization/configs/w8a8/W8A8_4096_4096_K100_AI.json
0 → 100755
View file @
7ef78993
{
"4096_4096"
:
{
"1"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
2
},
"2"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
2
},
"3"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
2
},
"4"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
2
},
"5"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
2
},
"6"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
2
},
"7"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
2
},
"8"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
2
},
"9"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
2
},
"10"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
2
},
"11"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
2
},
"12"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
2
},
"13"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
2
},
"14"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
2
},
"15"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
2
},
"16"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
2
},
"17"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
2
},
"18"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
2
,
"num_stages"
:
0
,
"num_warps"
:
8
},
"19"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
2
,
"num_stages"
:
0
,
"num_warps"
:
8
},
"20"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
2
,
"num_stages"
:
0
,
"num_warps"
:
8
},
"21"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
2
},
"22"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
2
},
"23"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
2
},
"24"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
2
},
"25"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
2
},
"26"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
2
},
"27"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
2
},
"28"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
2
},
"29"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
2
},
"30"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
2
},
"31"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
2
},
"32"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
2
},
"64"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
4
},
"128"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
8
},
"256"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
8
},
"512"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
4
},
"1024"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
4
},
"2048"
:
{
"BLOCK_SIZE_M"
:
256
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
8
},
"4096"
:
{
"BLOCK_SIZE_M"
:
256
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
8
},
"8192"
:
{
"BLOCK_SIZE_M"
:
256
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
8
}
}
}
\ No newline at end of file
vllm/model_executor/models/llama.py
View file @
7ef78993
...
...
@@ -691,7 +691,7 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA):
m
=
int
(
key
.
split
(
'_'
)[
0
])
n
=
int
(
key
.
split
(
'_'
)[
1
])
k
=
int
(
key
.
split
(
'_'
)[
2
])
ops
.
_int8_gemm_helper
(
m
=
m
,
n
=
n
,
k
=
k
,
per_token_act_quant
=
True
,
per_out_channel_weight_quant
=
True
,
use_bias
=
False
,
best_config
=
value
)
ops
.
triton
_int8_gemm_helper
(
m
=
m
,
n
=
n
,
k
=
k
,
per_token_act_quant
=
True
,
per_out_channel_weight_quant
=
True
,
use_bias
=
False
,
best_config
=
value
)
# If this function is called, it should always initialize KV cache scale
# factors (or else raise an exception). Thus, handled exceptions should
...
...
vllm/utils.py
View file @
7ef78993
...
...
@@ -1345,7 +1345,8 @@ class W8a8GetCacheJSON:
return
cls
.
_instance
def
_initialize
(
self
):
self
.
triton_json_dir
=
(
os
.
getenv
(
'TRITON_JSON_DIR'
,
'./cache'
))
current_folder_path
=
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
))
self
.
triton_json_dir
=
(
os
.
getenv
(
'TRITON_JSON_DIR'
,
current_folder_path
+
'/model_executor/layers/quantization/configs/w8a8'
))
self
.
triton_json_dict
=
[]
def
getspec_config
(
self
,
configs_dict
,
M
,
N
,
K
):
...
...
@@ -1418,5 +1419,5 @@ class W8a8GetCacheJSON:
def
get_w8a8json_name
(
self
,
n
,
k
):
device_name
=
current_platform
.
get_device_name
().
replace
(
" "
,
"_"
)
return
self
.
triton_json_dir
+
f
"/W8A8_
{
n
}
_
{
k
}
_
HCU
{
device_name
}
.json"
return
self
.
triton_json_dir
+
f
"/W8A8_
{
n
}
_
{
k
}
_
{
device_name
}
.json"
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment