Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
47bd229c
Commit
47bd229c
authored
Feb 20, 2025
by
yangql
Browse files
适配deepseekv3\v2 moe awq的推理支持
parent
4a734b9d
Changes
31
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
4380 additions
and
95 deletions
+4380
-95
benchmarks/kernels/benchmark_moe_int4.py
benchmarks/kernels/benchmark_moe_int4.py
+714
-0
setup.py
setup.py
+2
-1
vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=BW200,dtype=int4_w4a16.json
...nfigs/E=256,N=128,device_name=BW200,dtype=int4_w4a16.json
+164
-0
vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=K100_AI,dtype=int4_w4a16.json
...igs/E=256,N=128,device_name=K100_AI,dtype=int4_w4a16.json
+164
-0
vllm/model_executor/layers/fused_moe/fused_moe.py
vllm/model_executor/layers/fused_moe/fused_moe.py
+1
-1
vllm/model_executor/layers/quantization/awq.py
vllm/model_executor/layers/quantization/awq.py
+70
-15
vllm/model_executor/layers/quantization/awq_triton.py
vllm/model_executor/layers/quantization/awq_triton.py
+93
-78
vllm/model_executor/layers/quantization/configs/awq/AWQ_1536_7168_BW200.json
.../layers/quantization/configs/awq/AWQ_1536_7168_BW200.json
+244
-0
vllm/model_executor/layers/quantization/configs/awq/AWQ_1536_7168_K100_AI.json
...ayers/quantization/configs/awq/AWQ_1536_7168_K100_AI.json
+244
-0
vllm/model_executor/layers/quantization/configs/awq/AWQ_3072_1536_BW200.json
.../layers/quantization/configs/awq/AWQ_3072_1536_BW200.json
+244
-0
vllm/model_executor/layers/quantization/configs/awq/AWQ_3072_1536_K100_AI.json
...ayers/quantization/configs/awq/AWQ_3072_1536_K100_AI.json
+244
-0
vllm/model_executor/layers/quantization/configs/awq/AWQ_4096_512_BW200.json
...r/layers/quantization/configs/awq/AWQ_4096_512_BW200.json
+244
-0
vllm/model_executor/layers/quantization/configs/awq/AWQ_4096_512_K100_AI.json
...layers/quantization/configs/awq/AWQ_4096_512_K100_AI.json
+244
-0
vllm/model_executor/layers/quantization/configs/awq/AWQ_4608_7168_BW200.json
.../layers/quantization/configs/awq/AWQ_4608_7168_BW200.json
+244
-0
vllm/model_executor/layers/quantization/configs/awq/AWQ_4608_7168_K100_AI.json
...ayers/quantization/configs/awq/AWQ_4608_7168_K100_AI.json
+244
-0
vllm/model_executor/layers/quantization/configs/awq/AWQ_512_7168_BW200.json
...r/layers/quantization/configs/awq/AWQ_512_7168_BW200.json
+244
-0
vllm/model_executor/layers/quantization/configs/awq/AWQ_512_7168_K100_AI.json
...layers/quantization/configs/awq/AWQ_512_7168_K100_AI.json
+244
-0
vllm/model_executor/layers/quantization/configs/awq/AWQ_576_7168_BW200.json
...r/layers/quantization/configs/awq/AWQ_576_7168_BW200.json
+244
-0
vllm/model_executor/layers/quantization/configs/awq/AWQ_576_7168_K100_AI.json
...layers/quantization/configs/awq/AWQ_576_7168_K100_AI.json
+244
-0
vllm/model_executor/layers/quantization/configs/awq/AWQ_7168_2048_BW200.json
.../layers/quantization/configs/awq/AWQ_7168_2048_BW200.json
+244
-0
No files found.
benchmarks/kernels/benchmark_moe_int4.py
0 → 100644
View file @
47bd229c
# SPDX-License-Identifier: Apache-2.0
import
argparse
import
time
from
datetime
import
datetime
from
itertools
import
product
from
typing
import
Any
,
Dict
,
List
,
Tuple
,
TypedDict
import
ray
import
torch
import
triton
from
ray.experimental.tqdm_ray
import
tqdm
from
transformers
import
AutoConfig
from
vllm.model_executor.layers.fused_moe.fused_moe
import
*
from
vllm.platforms
import
current_platform
from
vllm.utils
import
FlexibleArgumentParser
# FP8 dtype that the fp8_w8a8 benchmark path casts its weights to: ROCm picks
# the "fnuz" e4m3 variant, every other platform the plain e4m3fn one.
FP8_DTYPE = torch.float8_e4m3fnuz if current_platform.is_rocm() else torch.float8_e4m3fn
class BenchmarkConfig(TypedDict):
    """Launch parameters of one candidate fused-MoE kernel configuration.

    Dicts of this shape are produced by the tuning-space helpers in this file
    and handed to ``override_config`` by ``benchmark_config``.
    """
    # Tile sizes of the kernel along the M, N and K dimensions.
    BLOCK_SIZE_M: int
    BLOCK_SIZE_N: int
    BLOCK_SIZE_K: int
    # Grouping factor along M (compared against M when pruning ROCm configs).
    GROUP_SIZE_M: int
    num_warps: int
    num_stages: int
    # Only generated by the int4_w4a16 and nn_moe tuning spaces in this file.
    num_ldmatrixes: Optional[int]
def benchmark_config(config: BenchmarkConfig,
                     num_tokens: int,
                     num_experts: int,
                     shard_intermediate_size: int,
                     hidden_size: int,
                     topk: int,
                     dtype: torch.dtype,
                     use_fp8_w8a8: bool,
                     use_int8_w8a16: bool,
                     use_int4_w4a16: bool,
                     group_size: int,
                     num_iters: int = 100,
                     nn_moe: Optional[bool] = False) -> float:
    """Time ``fused_moe`` under one kernel config and return its mean latency.

    Random activations, weights, scales and zero points are created for the
    selected quantization mode. ``fused_moe`` is then run once for JIT
    compilation, five more times as warm-up, and finally ``num_iters`` times
    between a pair of CUDA events.

    Args:
        config: Kernel launch parameters applied through ``override_config``.
        num_tokens: Number of rows of the activation matrix.
        num_experts: Number of experts (leading dimension of both weights).
        shard_intermediate_size: Output width of ``w1``; ``w2`` consumes half
            of it.
        hidden_size: Width of the activations.
        topk: Forwarded to ``fused_moe``.
        dtype: Activation dtype; also the weight dtype on the unquantized
            path.
        use_fp8_w8a8: Benchmark the FP8 weight/activation path.
        use_int8_w8a16: Benchmark the int8-weight path.
        use_int4_w4a16: Benchmark the packed int4-weight path.
        group_size: Quantization group size; only read on the int4 path.
        num_iters: Number of timed invocations.
        nn_moe: Selects the transposed weight layout and is forwarded as
            ``use_nn_moe``; forced to ``False`` on the int4 path.

    Returns:
        Mean latency of a single ``fused_moe`` call in microseconds.
    """
    init_dtype = torch.float16 if use_fp8_w8a8 else dtype
    x = torch.randn(num_tokens, hidden_size, dtype=dtype)

    # The three weight layouts are mutually exclusive. The chain must be
    # if/elif/else, otherwise the float fallback would overwrite the int8
    # weights created by the first branch.
    if use_int8_w8a16:
        if not nn_moe:
            w1 = torch.randint(-127,
                               127, (
                                   num_experts,
                                   shard_intermediate_size,
                                   hidden_size,
                               ),
                               dtype=torch.int8)
            w2 = torch.randint(-127,
                               127, (
                                   num_experts,
                                   hidden_size,
                                   shard_intermediate_size // 2,
                               ),
                               dtype=torch.int8)
        else:
            w1 = torch.randint(
                -127,
                127, (num_experts, hidden_size, shard_intermediate_size),
                dtype=torch.int8)
            w2 = torch.randint(
                -127,
                127, (num_experts, shard_intermediate_size // 2, hidden_size),
                dtype=torch.int8)
    elif use_int4_w4a16:
        # Packed weights: the last dimension is half of the logical one
        # (presumably two 4-bit values per uint8 -- confirm against kernel).
        w1 = torch.randint(0,
                           255, (
                               num_experts,
                               shard_intermediate_size,
                               hidden_size // 2,
                           ),
                           dtype=torch.uint8)
        w2 = torch.randint(0,
                           255, (
                               num_experts,
                               hidden_size,
                               shard_intermediate_size // 4,
                           ),
                           dtype=torch.uint8)
    else:
        if not nn_moe:
            w1 = torch.randn(num_experts,
                             shard_intermediate_size,
                             hidden_size,
                             dtype=init_dtype)
            w2 = torch.randn(num_experts,
                             hidden_size,
                             shard_intermediate_size // 2,
                             dtype=init_dtype)
        else:
            w1 = torch.randn(num_experts,
                             hidden_size,
                             shard_intermediate_size,
                             dtype=init_dtype)
            w2 = torch.randn(num_experts,
                             shard_intermediate_size // 2,
                             hidden_size,
                             dtype=init_dtype)

    # One pre-generated gating tensor per timed iteration.
    gating_output = torch.randn(num_iters,
                                num_tokens,
                                num_experts,
                                dtype=torch.float32)

    w1_scale = None
    w2_scale = None
    a1_scale = None
    a2_scale = None
    w1_zp = None
    w2_zp = None
    block_shape = None
    if use_int8_w8a16:
        w1_scale = torch.randn((num_experts, 2 * shard_intermediate_size),
                               dtype=torch.float32)
        w2_scale = torch.randn((hidden_size, num_experts),
                               dtype=torch.float32)
    if use_fp8_w8a8:
        w1_scale = torch.randn(num_experts, dtype=torch.float32)
        w2_scale = torch.randn(num_experts, dtype=torch.float32)
        a1_scale = torch.randn(1, dtype=torch.float32)
        a2_scale = torch.randn(1, dtype=torch.float32)

        w1 = w1.to(FP8_DTYPE)
        w2 = w2.to(FP8_DTYPE)
    if use_int4_w4a16:
        # Group-wise scales and zero points along the reduction dimension.
        w1_scale = torch.randn(
            (num_experts, shard_intermediate_size, hidden_size // group_size),
            dtype=torch.float16)
        w2_scale = torch.randn((num_experts, hidden_size,
                                shard_intermediate_size // (2 * group_size)),
                               dtype=torch.float16)
        w1_zp = torch.randint(0,
                              255, (
                                  num_experts,
                                  shard_intermediate_size // 2,
                                  hidden_size // group_size,
                              ),
                              dtype=torch.uint8)
        w2_zp = torch.randint(0,
                              255, (
                                  num_experts,
                                  hidden_size // 2,
                                  shard_intermediate_size //
                                  (2 * group_size),
                              ),
                              dtype=torch.uint8)
        # The int4 weights above are always created in the default layout.
        nn_moe = False
        block_shape = [0, group_size]

    input_gating = torch.randn(num_tokens, num_experts, dtype=torch.float32)
    print(f"shape: {x.shape[0]} | config: {config}")

    def prepare(i: int):
        # Swap in fresh router logits without reallocating the input tensor.
        input_gating.copy_(gating_output[i])

    def run():
        from vllm.model_executor.layers.fused_moe import override_config
        with override_config(config):
            fused_moe(
                x,
                w1,
                w2,
                input_gating,
                topk,
                renormalize=True,
                inplace=True,
                use_fp8_w8a8=use_fp8_w8a8,
                use_int8_w8a16=use_int8_w8a16,
                use_int4_w4a16=use_int4_w4a16,
                w1_scale=w1_scale,
                w2_scale=w2_scale,
                w1_zp=w1_zp,
                w2_zp=w2_zp,
                a1_scale=a1_scale,
                a2_scale=a2_scale,
                use_nn_moe=nn_moe,
                block_shape=block_shape,
            )

    # JIT compilation & warmup
    run()
    torch.cuda.synchronize()

    # Warmup
    for _ in range(5):
        run()
    torch.cuda.synchronize()

    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)

    latencies: List[float] = []
    for i in range(num_iters):
        prepare(i)
        torch.cuda.synchronize()

        start_event.record()
        run()
        end_event.record()
        end_event.synchronize()
        latencies.append(start_event.elapsed_time(end_event))

    # elapsed_time() reports milliseconds and each iteration launches exactly
    # one fused_moe call, so the mean is sum / num_iters. The former extra
    # division by 10 belonged to a CUDA-graph variant that replayed ten calls
    # per iteration and under-reported latency tenfold without it.
    avg = sum(latencies) / num_iters * 1000  # us
    print(f"avg: {avg}")
    return avg
def get_rocm_tuning_space(use_fp16,
                          use_int4_w4a16,
                          nn_moe: Optional[bool] = False):
    """Return the ROCm tuning space as a mapping of parameter name to values.

    Every value is a list so that callers can take the Cartesian product of
    all ranges.

    Args:
        use_fp16: True for the unquantized path; enables the
            ``matrix_instr_nonkdim`` / ``kpack`` ranges and keeps
            ``BLOCK_SIZE_K == 16``.
        use_int4_w4a16: Return the dedicated, smaller int4 space instead.
        nn_moe: Add the ``num_ldmatrixes`` range to the generic space.

    Returns:
        Dict mapping each tunable parameter name to its list of candidates.
    """
    if use_int4_w4a16:
        return {
            "BLOCK_SIZE_M": [16, 32, 64],
            "BLOCK_SIZE_N": [32, 64, 128],
            "BLOCK_SIZE_K": [16, 32, 64],
            "GROUP_SIZE_M": [1, 4, 8, 16],
            "num_warps": [1, 2, 4, 8],
            "num_stages": [2, 4],
            "num_ldmatrixes": [0],
        }

    block_mn_range = [16, 32, 64, 128, 256]
    block_k_range = [16, 32, 64, 128, 256]
    if not use_fp16:
        block_k_range.remove(16)  # BLOCK_K=16 not supported for fp8

    param_ranges = {
        "BLOCK_SIZE_M": block_mn_range,
        "BLOCK_SIZE_N": block_mn_range,
        "BLOCK_SIZE_K": block_k_range,
        "GROUP_SIZE_M": [1, 4, 8, 16, 32],
        "num_warps": [1, 2, 4, 8],
        "num_stages": [2],
        "waves_per_eu": [0],
    }
    if nn_moe:
        # Must be a one-element list, not a bare int: the consumer expands
        # every range with itertools.product, which needs iterables.
        param_ranges["num_ldmatrixes"] = [1]
    if use_fp16:
        param_ranges["matrix_instr_nonkdim"] = [16, 32]
        param_ranges["kpack"] = [1, 2]

    return param_ranges
def get_configs_compute_bound(use_fp16,
                              use_int4_w4a16,
                              nn_moe: Optional[bool] = False
                              ) -> List[Dict[str, int]]:
    """Enumerate every candidate kernel config for the current platform.

    On ROCm the ranges come from ``get_rocm_tuning_space``; elsewhere a fixed,
    reduced space is used. The result is the Cartesian product of all ranges,
    one dict per combination.
    """
    if current_platform.is_rocm():
        space = get_rocm_tuning_space(use_fp16, use_int4_w4a16, nn_moe)
    else:
        # Reduced search space for faster tuning.
        # TODO(woosuk): Increase the search space and use a performance model to
        # prune the search space.
        space = {
            "BLOCK_SIZE_M": [16, 32, 64, 128, 256],
            "BLOCK_SIZE_N": [32, 64, 128, 256],
            "BLOCK_SIZE_K": [64, 128, 256],
            "GROUP_SIZE_M": [1, 16, 32, 64],
            "num_warps": [4, 8],
            "num_stages": [2, 3, 4, 5],
        }

    names = tuple(space.keys())
    ranges = tuple(space.values())
    return [dict(zip(names, combo)) for combo in product(*ranges)]
def prune_rocm_search_space(num_tokens, shard_intermediate_size, hidden_size,
                            search_space, is_fp16):
    """Prune ``search_space`` for two GEMM shapes and merge the survivors.

    Both shapes use ``M = num_tokens * 2``. The (N, K) pairs mirror the
    ``w1`` and ``w2`` weight shapes created by ``benchmark_config``.
    """
    m = num_tokens * 2
    gemm_shapes = (
        (shard_intermediate_size, hidden_size),
        (hidden_size, shard_intermediate_size // 2),
    )
    kept_first, kept_second = [
        prune_rocm_configs(m, n, k, search_space, is_fp16)
        for n, k in gemm_shapes
    ]
    return merge_unique_dicts(kept_first, kept_second)
# The following code is inspired by ROCm/Triton GEMM tuning script:
# https://github.com/ROCm/triton/blob/triton-mlir/scripts/amd/gemm/tune_gemm.py#L89
def prune_rocm_configs(M, N, K, configs, is_fp16=True):
    """Return the configs from ``configs`` that pass all pruning rules.

    Args:
        M, N, K: GEMM problem size.
        configs: Iterable of config dicts; input order is preserved.
        is_fp16: Selects 2-byte (True) or 1-byte elements for the LDS
            estimate and enables the ``matrix_instr_nonkdim`` checks.
    """
    bytes_a = 2 if is_fp16 else 1
    bytes_b = 2 if is_fp16 else 1

    mfma = 32
    if M < 32 or N < 32:
        mfma = 16

    # TODO (zhanglx): figure out the boundary between large and small gemms
    is_large = M >= 2048 and N >= 2048

    survivors = []
    for cfg in configs:
        block_m = cfg.get("BLOCK_SIZE_M")
        block_n = cfg.get("BLOCK_SIZE_N")
        block_k = cfg.get("BLOCK_SIZE_K")
        warps = cfg.get("num_warps")

        if is_fp16:
            nonkdim = cfg.get("matrix_instr_nonkdim")
            if nonkdim > mfma:
                continue
        if mfma == 4 and block_k < 64:
            continue
        # some layouts could not work properly in case
        # number elements per thread is less 1
        if block_m * block_n < 64:
            continue

        split_k = cfg.get("SPLIT_K", 1)
        group_m = cfg.get("GROUP_SIZE_M")
        if is_fp16:
            if nonkdim > block_m or nonkdim > block_n:
                continue
            if nonkdim >= M and nonkdim != block_m:
                continue
            if nonkdim >= N and nonkdim != block_n:
                continue

        # Skip BLOCK_SIZE that is too large compare to M/N
        # unless BLOCK_SIZE is already small enough
        if M * 2 < block_m and block_m != 16:
            continue
        if N * 2 < block_n and block_n != 16:
            continue

        # skip large split_k when not necessary
        if split_k != 1 and not need_split_k(M, N, K):
            continue

        # skip split_k that leads to EVEN_K = false
        if K % (split_k * block_k) != 0:
            continue

        # skip large GROUP_M
        if group_m * block_m > M and group_m != 1:
            continue

        # out of shared memory resource
        # TODO (zhanglx): This does not consider the LDS usage in the epilogue
        lds_bytes = block_k * block_m * bytes_a + block_k * block_n * bytes_b
        if lds_bytes > 65536:
            continue

        # Skip small block sizes and num_warps for large gemm
        # For fp16 and f8, we want to only use BLOCK_SIZE >= 64
        if is_large:
            if block_m < 64 or block_n < 64:
                continue
            if block_k < 64:
                continue
            if warps < 4:
                continue

        survivors.append(cfg)

    return survivors
def need_split_k(SIZE_M, SIZE_N, SIZE_K):
    """Return True when M or N is below 64 and K exceeds 1024."""
    has_small_side = SIZE_M < 64 or SIZE_N < 64
    if not has_small_side:
        return False
    return SIZE_K > 1024
def merge_unique_dicts(list1, list2):
    """Concatenate both lists and drop later duplicates, keeping first-seen order.

    Neither input list is modified.
    """
    unique = []
    for entry in [*list1, *list2]:
        if entry in unique:
            continue
        unique.append(entry)
    return unique
@ray.remote(num_gpus=1)
class BenchmarkWorker:
    """Ray actor that benchmarks or tunes fused-MoE configs on one GPU."""

    def __init__(self, seed: int) -> None:
        torch.set_default_device("cuda")
        current_platform.seed_everything(seed)
        self.seed = seed
        # Get the device ID to allocate tensors and kernels
        # on the respective GPU. This is required for Ray to work
        # correctly with multi-GPU tuning on the ROCm platform.
        self.device_id = int(ray.get_gpu_ids()[0])

    def benchmark(
        self,
        num_tokens: int,
        num_experts: int,
        shard_intermediate_size: int,
        hidden_size: int,
        topk: int,
        dtype: torch.dtype,
        use_fp8_w8a8: bool,
        use_int8_w8a16: bool,
        use_int4_w4a16: bool,
        group_size: int,
    ) -> Tuple[Dict[str, int], float]:
        """Time the stored (or default) config for one batch size.

        Returns:
            The config that was used and its latency from
            ``benchmark_config``.
        """
        current_platform.seed_everything(self.seed)
        dtype_str = get_config_dtype_str(dtype,
                                         use_int4_w4a16=use_int4_w4a16,
                                         use_int8_w8a16=use_int8_w8a16,
                                         use_fp8_w8a8=use_fp8_w8a8)
        # NOTE(woosuk): The current naming convention uses w2.shape[2], which
        # is the intermediate size after silu_and_mul.
        # On the int4 path the size used for the lookup is halved once more.
        config_shard_intermediate_size = (shard_intermediate_size //
                                          2 if use_int4_w4a16 else
                                          shard_intermediate_size)
        tuned = get_moe_configs(num_experts,
                                config_shard_intermediate_size // 2,
                                dtype_str)
        if tuned is not None:
            # Pick the stored batch size closest to the requested one.
            nearest = min(tuned.keys(), key=lambda x: abs(x - num_tokens))
            config = tuned[nearest]
        else:
            config = get_default_config(num_tokens,
                                        num_experts,
                                        config_shard_intermediate_size,
                                        hidden_size,
                                        topk,
                                        dtype_str,
                                        is_marlin=False)

        kernel_time = benchmark_config(config, num_tokens, num_experts,
                                       shard_intermediate_size, hidden_size,
                                       topk, dtype, use_fp8_w8a8,
                                       use_int8_w8a16, use_int4_w4a16,
                                       group_size)
        return config, kernel_time

    def tune(
        self,
        num_tokens: int,
        num_experts: int,
        shard_intermediate_size: int,
        hidden_size: int,
        topk: int,
        dtype: torch.dtype,
        use_fp8_w8a8: bool,
        use_int8_w8a16: bool,
        use_int4_w4a16: bool,
        group_size: int,
        search_space: List[Dict[str, int]],
        nn_moe: Optional[bool] = False,
    ) -> Dict[str, int]:
        """Benchmark every config in ``search_space`` and return the fastest.

        On ROCm the search space is pruned first. Configs that raise Triton's
        ``OutOfResources`` are skipped.
        """
        if current_platform.is_rocm():
            is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16 or use_int4_w4a16)
            search_space = prune_rocm_search_space(num_tokens,
                                                   shard_intermediate_size,
                                                   hidden_size, search_space,
                                                   is_fp16)

        best_time = float("inf")
        best_config = None
        with torch.cuda.device(self.device_id):
            for candidate in tqdm(search_space):
                try:
                    elapsed = benchmark_config(candidate,
                                               num_tokens,
                                               num_experts,
                                               shard_intermediate_size,
                                               hidden_size,
                                               topk,
                                               dtype,
                                               use_fp8_w8a8,
                                               use_int8_w8a16,
                                               use_int4_w4a16,
                                               group_size,
                                               num_iters=20,
                                               nn_moe=nn_moe)
                except triton.runtime.autotuner.OutOfResources:
                    # Some configurations may be invalid and fail to compile.
                    continue

                if elapsed < best_time:
                    best_time, best_config = elapsed, candidate

        now = datetime.now()
        print(f"{now.ctime()}] Completed tuning for batch_size={num_tokens}")
        assert best_config is not None
        return best_config
def sort_config(config: BenchmarkConfig) -> BenchmarkConfig:
    """Return a copy of ``config`` with its known keys in canonical order.

    The six core keys are required (``KeyError`` if one is missing). The
    optional keys are copied only when present, and any other key is dropped.
    """
    core_keys = (
        "BLOCK_SIZE_M",
        "BLOCK_SIZE_N",
        "BLOCK_SIZE_K",
        "GROUP_SIZE_M",
        "num_warps",
        "num_stages",
    )
    optional_keys = (
        "num_ldmatrixes",
        "waves_per_eu",
        "matrix_instr_nonkdim",
        "kpack",
    )
    ordered = {name: config[name] for name in core_keys}
    for name in optional_keys:
        if name in config:
            ordered[name] = config[name]
    return ordered
def save_configs(configs: Dict[int, BenchmarkConfig],
                 num_experts: int,
                 shard_intermediate_size: int,
                 hidden_size: int,
                 topk: int,
                 dtype: torch.dtype,
                 use_fp8_w8a8: bool,
                 use_int8_w8a16: bool,
                 use_int4_w4a16: bool,
                 use_nn_moe: Optional[bool] = False) -> None:
    """Write the tuned configs as indented JSON to the standard config file.

    The file name comes from ``get_config_file_name`` using the expert count,
    half of ``shard_intermediate_size`` and the dtype string. ``hidden_size``
    and ``topk`` are accepted but not used here.
    """
    dtype_str = get_config_dtype_str(dtype,
                                     use_int8_w8a16=use_int8_w8a16,
                                     use_int4_w4a16=use_int4_w4a16,
                                     use_fp8_w8a8=use_fp8_w8a8)

    # NOTE(woosuk): The current naming convention uses w2.shape[2], which
    # is the intermediate size after silu_and_mul.
    intermediate_after_act = shard_intermediate_size // 2
    filename = get_config_file_name(num_experts,
                                    intermediate_after_act,
                                    dtype_str,
                                    use_nn_moe=use_nn_moe)

    print(f"Writing best config to {filename}...")
    with open(filename, "w") as out_file:
        json.dump(configs, out_file, indent=4)
        out_file.write("\n")
def main(args: argparse.Namespace):
    """Benchmark or tune the fused-MoE kernel for the model in ``args.model``.

    The MoE dimensions are read from the Hugging Face config, one Ray worker
    is started per available GPU, and each requested batch size is either
    tuned (``--tune``, results saved as JSON) or benchmarked with the stored
    or default config.

    Raises:
        ValueError: If ``int4_w4a16`` is requested but the model config does
            not provide an AWQ ``group_size``.
    """
    print(args)

    config = AutoConfig.from_pretrained(
        args.model, trust_remote_code=args.trust_remote_code)

    arch = config.architectures[0]
    if arch == "DbrxForCausalLM":
        E = config.ffn_config.moe_num_experts
        topk = config.ffn_config.moe_top_k
        intermediate_size = config.ffn_config.ffn_hidden_size
    elif arch == "JambaForCausalLM":
        E = config.num_experts
        topk = config.num_experts_per_tok
        intermediate_size = config.intermediate_size
    # A membership test is required here: `x == "A" or "B"` is always truthy
    # and would make the Mixtral default below unreachable.
    elif arch in ("DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM"):
        E = config.n_routed_experts
        topk = config.num_experts_per_tok
        intermediate_size = config.moe_intermediate_size
    else:
        # Default: Mixtral.
        E = config.num_local_experts
        topk = config.num_experts_per_tok
        intermediate_size = config.intermediate_size
    shard_intermediate_size = 2 * intermediate_size // args.tp_size

    # The group size is only consumed on the int4 path. Unquantized
    # checkpoints carry no quantization_config at all.
    group_size = None
    quant_config = getattr(config, "quantization_config", None)
    if quant_config and quant_config.get("quant_method") == "awq":
        group_size = quant_config["group_size"]

    hidden_size = config.hidden_size
    dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
    use_fp8_w8a8 = args.dtype == "fp8_w8a8"
    use_int8_w8a16 = args.dtype == "int8_w8a16"
    use_int4_w4a16 = args.dtype == "int4_w4a16"

    if use_int4_w4a16 and group_size is None:
        # Fail here rather than deep inside a Ray worker.
        raise ValueError(
            "--dtype int4_w4a16 requires an AWQ checkpoint whose "
            "quantization_config provides 'group_size'")

    if args.batch_size is None:
        batch_sizes = [
            1, 2, 4, 8, 16, 24, 32, 48, 64, 96, 128, 256, 512, 1024, 1536,
            2048, 3072, 4096
        ]
    else:
        batch_sizes = [args.batch_size]

    ray.init(address=None, ignore_reinit_error=True, num_gpus=1)
    num_gpus = int(ray.available_resources()["GPU"])
    workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)]

    def _distribute(method: str, inputs: List[Any]) -> List[Any]:
        # Round-robin the work items over the workers, then wait for all.
        outputs = []
        worker_idx = 0
        for input_args in inputs:
            worker = workers[worker_idx]
            worker_method = getattr(worker, method)
            output = worker_method.remote(*input_args)
            outputs.append(output)
            worker_idx = (worker_idx + 1) % num_gpus
        return ray.get(outputs)

    if args.tune:
        is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16 or use_int4_w4a16)
        search_space = get_configs_compute_bound(is_fp16, use_int4_w4a16,
                                                 args.nn_moe)
        print(f"Start tuning over {len(search_space)} configurations...")

        start = time.time()
        configs = _distribute(
            "tune",
            [(batch_size, E, shard_intermediate_size, hidden_size, topk,
              dtype, use_fp8_w8a8, use_int8_w8a16, use_int4_w4a16, group_size,
              search_space, args.nn_moe) for batch_size in batch_sizes])
        best_configs = {
            M: sort_config(config)
            for M, config in zip(batch_sizes, configs)
        }
        # The int4 path saves under half the shard size (matching the lookup
        # in BenchmarkWorker.benchmark). Every path must pass use_int4_w4a16,
        # because save_configs requires it positionally.
        saved_shard_size = (shard_intermediate_size //
                            2 if use_int4_w4a16 else shard_intermediate_size)
        save_configs(best_configs,
                     E,
                     saved_shard_size,
                     hidden_size,
                     topk,
                     dtype,
                     use_fp8_w8a8,
                     use_int8_w8a16,
                     use_int4_w4a16,
                     use_nn_moe=args.nn_moe)
        end = time.time()
        print(f"Tuning took {end - start:.2f} seconds")
    else:
        outputs = _distribute(
            "benchmark",
            [(batch_size, E, shard_intermediate_size, hidden_size, topk,
              dtype, use_fp8_w8a8, use_int8_w8a16, use_int4_w4a16, group_size)
             for batch_size in batch_sizes])

        for batch_size, (config, kernel_time) in zip(batch_sizes, outputs):
            print(f"Batch size: {batch_size}, config: {config}")
            print(f"Kernel time: {kernel_time:.2f} us")
if __name__ == "__main__":
    # Command-line entry point: build the parser and hand over to main().
    parser = FlexibleArgumentParser()
    parser.add_argument(
        "--model",
        type=str,
        default="/home/yang/llm-models/vllm-awq-models/DeepSeek-R1-AWQ/")
    parser.add_argument("--tp-size",
                        "-tp",
                        "--tensor-parallel-size",
                        type=int,
                        default=8)
    parser.add_argument(
        "--dtype",
        type=str,
        choices=["auto", "fp8_w8a8", "int8_w8a16", "int4_w4a16"],
        default="int4_w4a16")
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--batch-size", type=int, required=False)
    parser.add_argument("--tune", action="store_true", default=False)
    # argparse's type=bool calls bool() on the raw string, so any non-empty
    # value (including "False") became True. Parse the text explicitly while
    # still accepting the existing `--nn_moe True` spelling.
    parser.add_argument(
        "--nn_moe",
        type=lambda text: text.strip().lower() in ("1", "true", "yes", "y"),
        default=False)
    # NOTE(review): store_true with default=True means this flag is always
    # on. Left unchanged because the default model may depend on it.
    parser.add_argument("--trust-remote-code",
                        action="store_true",
                        default=True)
    args = parser.parse_args()

    main(args)
setup.py
View file @
47bd229c
...
@@ -688,7 +688,8 @@ package_data = {
...
@@ -688,7 +688,8 @@ package_data = {
"model_executor/layers/fused_moe/configs/*.json"
,
"model_executor/layers/fused_moe/configs/*.json"
,
"model_executor/layers/quantization/utils/configs/*.json"
,
"model_executor/layers/quantization/utils/configs/*.json"
,
"benchmarks/*.py"
,
"benchmarks/*.py"
,
"model_executor/layers/quantization/configs/w8a8/*.json"
"model_executor/layers/quantization/configs/w8a8/*.json"
,
"model_executor/layers/quantization/configs/awq/*.json"
]
]
}
}
...
...
vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=BW200,dtype=int4_w4a16.json
0 → 100644
View file @
47bd229c
{
"1"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
4
,
"num_ldmatrixes"
:
0
},
"2"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
0
},
"4"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
0
},
"8"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
0
},
"16"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
0
},
"24"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
0
},
"32"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
0
},
"48"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
0
},
"64"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
0
},
"96"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
2
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
0
},
"128"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
0
},
"256"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
0
},
"512"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
0
},
"1024"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
0
},
"1536"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
0
},
"2048"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
0
},
"3072"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
0
},
"4096"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
0
}
}
vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=K100_AI,dtype=int4_w4a16.json
0 → 100644
View file @
47bd229c
{
"1"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
4
,
"num_ldmatrixes"
:
0
},
"2"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
0
},
"4"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
0
},
"8"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
0
},
"16"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
0
},
"24"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
0
},
"32"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
0
},
"48"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
0
},
"64"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
0
},
"96"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
0
},
"128"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
0
},
"256"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
0
},
"512"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
0
},
"1024"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
0
},
"1536"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
0
},
"2048"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
0
},
"3072"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
0
},
"4096"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"num_warps"
:
8
,
"num_stages"
:
2
,
"num_ldmatrixes"
:
0
}
}
vllm/model_executor/layers/fused_moe/fused_moe.py
View file @
47bd229c
...
@@ -1066,7 +1066,7 @@ def get_config_dtype_str(dtype: torch.dtype,
...
@@ -1066,7 +1066,7 @@ def get_config_dtype_str(dtype: torch.dtype,
elif
use_int8_w8a16
:
elif
use_int8_w8a16
:
return
"int8_w8a16"
return
"int8_w8a16"
elif
use_int4_w4a16
:
elif
use_int4_w4a16
:
return
"int4_w
8
a16"
return
"int4_w
4
a16"
elif
dtype
==
torch
.
float
:
elif
dtype
==
torch
.
float
:
# avoiding cases where kernel fails when float32 MoE
# avoiding cases where kernel fails when float32 MoE
# use fp16/bfloat16 configs
# use fp16/bfloat16 configs
...
...
vllm/model_executor/layers/quantization/awq.py
View file @
47bd229c
...
@@ -5,7 +5,10 @@ from typing import Any, Dict, List, Optional
...
@@ -5,7 +5,10 @@ from typing import Any, Dict, List, Optional
import
torch
import
torch
import
os
import
os
import
torch.nn.functional
as
F
import
torch.nn.functional
as
F
import
vllm.envs
as
envs
import
json
import
math
from
vllm.platforms
import
current_platform
from
vllm
import
_custom_ops
as
ops
from
vllm
import
_custom_ops
as
ops
from
vllm.model_executor.layers.linear
import
(
LinearBase
,
LinearMethodBase
,
from
vllm.model_executor.layers.linear
import
(
LinearBase
,
LinearMethodBase
,
UnquantizedLinearMethod
)
UnquantizedLinearMethod
)
...
@@ -13,6 +16,57 @@ from vllm.model_executor.layers.quantization.base_config import (
...
@@ -13,6 +16,57 @@ from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig
)
QuantizationConfig
)
from
vllm.model_executor.parameter
import
(
GroupQuantScaleParameter
,
from
vllm.model_executor.parameter
import
(
GroupQuantScaleParameter
,
PackedvLLMParameter
)
PackedvLLMParameter
)
from
vllm.model_executor.layers.quantization.awq_triton
import
awq_gemm_triton
from
vllm.logger
import
init_logger
logger
=
init_logger
(
__name__
)
triton_configs_dict
=
{}
def get_triton_cache(file_path):
    """Load tuned Triton configs from a JSON file into ``triton_configs_dict``.

    The file maps an outer key to ``{sub_key: config}``. Each config is stored
    under ``"{sub_key}_{outer_key}"``, which has the same ``M_N_K`` form that
    ``getspec_config`` looks up. Nothing is returned; a missing file is
    silently ignored.
    """
    # Return early on a missing file: `cachedata` is only bound when the
    # file could be read.
    if not os.path.exists(file_path):
        return

    with open(file_path, 'r') as file:
        cachedata = json.load(file)

    required_fields = ('SPLIT_K', 'BLOCK_SIZE_M', 'BLOCK_SIZE_N',
                       'BLOCK_SIZE_K', 'GROUP_SIZE_M', 'num_stages',
                       'num_warps')
    # Flatten the cache into key -> config entries: [M_N_K] -> [config].
    for key, value in cachedata.items():
        for sub_key, sub_value in value.items():
            configs_key = f"{sub_key}_{key}"
            # Values are converted with int() because the file may store
            # them as strings.
            configs_value = {
                name: int(sub_value[name])
                for name in required_fields
            }
            if 'num_ldmatrixes' in sub_value:
                configs_value["num_ldmatrixes"] = int(
                    sub_value['num_ldmatrixes'])
            triton_configs_dict[configs_key] = configs_value

    logger.info("%s have loaded!", file_path)
def default_execution(k, n):
    """Load the tuned AWQ Triton config file for one ``(k, n)`` size, once.

    The file looked for is ``configs/awq/AWQ_{n}_{k}_{device}.json`` next to
    this module, where ``device`` is the current platform's device name with
    spaces replaced by underscores. If it exists it is handed to
    ``get_triton_cache``; otherwise nothing happens.

    The call is a no-op when the ``1_{n}_{k}`` entry is already present in
    ``triton_configs_dict``, which is used as the "already loaded" marker.

    Args:
        k: Second size component of the file name and cache key.
        n: First size component of the file name and cache key.
    """
    if f"1_{n}_{k}" in triton_configs_dict:
        # Configs for this size were loaded by an earlier call.
        return

    module_dir = os.path.dirname(os.path.abspath(__file__))
    config_dir = f"{module_dir}/configs/awq/"
    device = current_platform.get_device_name().replace(" ", "_")
    config_path = os.path.join(config_dir, f"AWQ_{n}_{k}_{device}.json")

    # Only an existing regular .json file is loaded.
    if not (os.path.isfile(config_path) and config_path.endswith(".json")):
        return
    get_triton_cache(config_path)
def getspec_config(M, N, K):
    """Return the cached Triton config for an exact ``M_N_K`` key, if any.

    Args:
        M: First component of the lookup key.
        N: Second component of the lookup key.
        K: Third component of the lookup key.

    Returns:
        The config dict stored in ``triton_configs_dict`` under
        ``f"{M}_{N}_{K}"``, or ``None`` when no such entry has been loaded.
    """
    # Single lookup; dict.get already yields None for a missing key.
    return triton_configs_dict.get(f"{M}_{N}_{K}")
class
AWQShareWorkSpace
:
class
AWQShareWorkSpace
:
...
@@ -111,7 +165,6 @@ class AWQLinearMethod(LinearMethodBase):
...
@@ -111,7 +165,6 @@ class AWQLinearMethod(LinearMethodBase):
self
.
quant_config
=
quant_config
self
.
quant_config
=
quant_config
self
.
awqsingleton
=
AWQShareWorkSpace
()
self
.
awqsingleton
=
AWQShareWorkSpace
()
self
.
use_awq_pad
=
os
.
environ
.
get
(
'AWQ_PAD'
)
==
'1'
self
.
use_awq_pad
=
os
.
environ
.
get
(
'AWQ_PAD'
)
==
'1'
self
.
AWQ_CK_GEMMBS
=
int
(
os
.
getenv
(
'AWQ_CK_GEMMBS'
,
'20000'
))
def
create_weights
(
self
,
layer
:
torch
.
nn
.
Module
,
def
create_weights
(
self
,
layer
:
torch
.
nn
.
Module
,
input_size_per_partition
:
int
,
input_size_per_partition
:
int
,
...
@@ -178,7 +231,9 @@ class AWQLinearMethod(LinearMethodBase):
...
@@ -178,7 +231,9 @@ class AWQLinearMethod(LinearMethodBase):
layer
.
register_parameter
(
"qzeros"
,
qzeros
)
layer
.
register_parameter
(
"qzeros"
,
qzeros
)
layer
.
register_parameter
(
"scales"
,
scales
)
layer
.
register_parameter
(
"scales"
,
scales
)
layer
.
register_parameter
(
"zeros_and_scales"
,
zeros_and_scales
)
layer
.
register_parameter
(
"zeros_and_scales"
,
zeros_and_scales
)
# 加载triton_config
if
envs
.
VLLM_USE_TRITON_AWQ
:
default_execution
(
input_size_per_partition
,
output_size_per_partition
)
def
process_weights_after_loading
(
self
,
layer
:
torch
.
nn
.
Module
)
->
None
:
def
process_weights_after_loading
(
self
,
layer
:
torch
.
nn
.
Module
)
->
None
:
layer
.
qweight
=
torch
.
nn
.
Parameter
(
layer
.
qweight
.
data
,
layer
.
qweight
=
torch
.
nn
.
Parameter
(
layer
.
qweight
.
data
,
...
@@ -196,7 +251,9 @@ class AWQLinearMethod(LinearMethodBase):
...
@@ -196,7 +251,9 @@ class AWQLinearMethod(LinearMethodBase):
bias
:
Optional
[
torch
.
Tensor
]
=
None
)
->
torch
.
Tensor
:
bias
:
Optional
[
torch
.
Tensor
]
=
None
)
->
torch
.
Tensor
:
qweight
=
layer
.
qweight
qweight
=
layer
.
qweight
zeros_and_scales
=
layer
.
zeros_and_scales
zeros_and_scales
=
layer
.
zeros_and_scales
qzeros
=
layer
.
qzeros
scales
=
layer
.
scales
pack_factor
=
self
.
quant_config
.
pack_factor
out_shape
=
(
x
.
shape
[:
-
1
]
+
(
qweight
.
shape
[
0
]
*
1
,
))
out_shape
=
(
x
.
shape
[:
-
1
]
+
(
qweight
.
shape
[
0
]
*
1
,
))
reshaped_x
=
x
.
reshape
(
-
1
,
x
.
shape
[
-
1
])
reshaped_x
=
x
.
reshape
(
-
1
,
x
.
shape
[
-
1
])
...
@@ -211,8 +268,15 @@ class AWQLinearMethod(LinearMethodBase):
...
@@ -211,8 +268,15 @@ class AWQLinearMethod(LinearMethodBase):
padding_group
=
0
padding_group
=
0
else
:
else
:
padding_group
=
0
padding_group
=
0
if
m
<=
self
.
AWQ_CK_GEMMBS
:
if
envs
.
VLLM_USE_TRITON_AWQ
:
if
m
>
16
:
m
=
2
**
math
.
ceil
(
math
.
log2
(
m
))
best_config
=
getspec_config
(
m
,
n
,
k
)
out
=
awq_gemm_triton
(
reshaped_x
,
qweight
,
scales
,
qzeros
,
pack_factor
,
best_config
)
out_shape
=
(
x
.
shape
[:
-
1
]
+
(
qweight
.
shape
[
1
]
*
8
,
))
else
:
out
=
ops
.
awq_gemm
(
reshaped_x
,
out
=
ops
.
awq_gemm
(
reshaped_x
,
qweight
,
qweight
,
zeros_and_scales
,
zeros_and_scales
,
...
@@ -223,15 +287,6 @@ class AWQLinearMethod(LinearMethodBase):
...
@@ -223,15 +287,6 @@ class AWQLinearMethod(LinearMethodBase):
padding_group
,
padding_group
,
self
.
awqsingleton
.
awqworkshapce
,
self
.
awqsingleton
.
awqworkshapce
,
self
.
awqsingleton
.
awqworkshapcesize
)
self
.
awqsingleton
.
awqworkshapcesize
)
else
:
#下面是采用rocblas的做法
deqweight
=
ops
.
dequant_w4_gemm_colmajor
(
# shape[n, k/8] ---> [n,k]
qweight
,
zeros_and_scales
,
k
+
padding_group
*
self
.
quant_config
.
group_size
,
n
,
self
.
quant_config
.
group_size
)
out
=
F
.
linear
(
reshaped_x
,
deqweight
[:,
0
:
k
])
if
bias
is
not
None
:
if
bias
is
not
None
:
out
.
add_
(
bias
)
out
.
add_
(
bias
)
...
...
vllm/model_executor/layers/quantization/awq_triton.py
View file @
47bd229c
...
@@ -44,10 +44,14 @@ def awq_dequantize_kernel(
...
@@ -44,10 +44,14 @@ def awq_dequantize_kernel(
result_masks
=
result_masks_y
[:,
None
]
&
result_masks_x
[
None
,
:]
result_masks
=
result_masks_y
[:,
None
]
&
result_masks_x
[
None
,
:]
# Load the weights.
# Load the weights.
iweights
=
tl
.
load
(
qweight_ptr
+
offsets
,
masks
,
0.0
)
iweights
=
tl
.
load
(
qweight_ptr
+
offsets
,
masks
)
iweights
=
tl
.
interleave
(
iweights
,
iweights
)
iweights
=
tl
.
interleave
(
iweights
,
iweights
)
iweights
=
tl
.
join
(
iweights
,
iweights
).
reshape
(
iweights
.
shape
[:
-
1
]
+
[
2
*
iweights
.
shape
[
-
1
]])
iweights
=
tl
.
interleave
(
iweights
,
iweights
)
iweights
=
tl
.
join
(
iweights
,
iweights
).
reshape
(
iweights
.
shape
[:
-
1
]
+
[
2
*
iweights
.
shape
[
-
1
]])
iweights
=
tl
.
join
(
iweights
,
iweights
).
reshape
(
iweights
.
shape
[:
-
1
]
+
[
2
*
iweights
.
shape
[
-
1
]])
# iweights = tl.interleave(iweights, iweights)
# iweights = tl.interleave(iweights, iweights)
# iweights = tl.interleave(iweights, iweights)
# Create reverse AWQ order as tensor: [0, 4, 1, 5, 2, 6, 3, 7]
# Create reverse AWQ order as tensor: [0, 4, 1, 5, 2, 6, 3, 7]
# that will map given indices to the correct order.
# that will map given indices to the correct order.
...
@@ -73,10 +77,14 @@ def awq_dequantize_kernel(
...
@@ -73,10 +77,14 @@ def awq_dequantize_kernel(
zero_masks
=
zero_masks_y
[:,
None
]
&
zero_masks_x
[
None
,
:]
zero_masks
=
zero_masks_y
[:,
None
]
&
zero_masks_x
[
None
,
:]
# Load the zeros.
# Load the zeros.
zeros
=
tl
.
load
(
zeros_ptr
+
zero_offsets
,
zero_masks
,
0.0
)
zeros
=
tl
.
load
(
zeros_ptr
+
zero_offsets
,
zero_masks
)
zeros
=
tl
.
interleave
(
zeros
,
zeros
)
# zeros = tl.interleave(zeros, zeros)
zeros
=
tl
.
interleave
(
zeros
,
zeros
)
# zeros = tl.interleave(zeros, zeros)
zeros
=
tl
.
interleave
(
zeros
,
zeros
)
# zeros = tl.interleave(zeros, zeros)
zeros
=
tl
.
join
(
zeros
,
zeros
).
reshape
(
zeros
.
shape
[:
-
1
]
+
[
2
*
zeros
.
shape
[
-
1
]])
zeros
=
tl
.
join
(
zeros
,
zeros
).
reshape
(
zeros
.
shape
[:
-
1
]
+
[
2
*
zeros
.
shape
[
-
1
]])
zeros
=
tl
.
join
(
zeros
,
zeros
).
reshape
(
zeros
.
shape
[:
-
1
]
+
[
2
*
zeros
.
shape
[
-
1
]])
zeros
=
tl
.
broadcast_to
(
zeros
,
(
BLOCK_SIZE_Y
,
BLOCK_SIZE_X
*
8
))
zeros
=
tl
.
broadcast_to
(
zeros
,
(
BLOCK_SIZE_Y
,
BLOCK_SIZE_X
*
8
))
# Unpack and reorder: shift out the correct 4-bit value and mask.
# Unpack and reorder: shift out the correct 4-bit value and mask.
...
@@ -93,7 +101,7 @@ def awq_dequantize_kernel(
...
@@ -93,7 +101,7 @@ def awq_dequantize_kernel(
scale_masks
=
scale_masks_y
[:,
None
]
&
scale_masks_x
[
None
,
:]
scale_masks
=
scale_masks_y
[:,
None
]
&
scale_masks_x
[
None
,
:]
# Load the scales.
# Load the scales.
scales
=
tl
.
load
(
scales_ptr
+
scale_offsets
,
scale_masks
,
0.0
)
scales
=
tl
.
load
(
scales_ptr
+
scale_offsets
,
scale_masks
)
scales
=
tl
.
broadcast_to
(
scales
,
(
BLOCK_SIZE_Y
,
BLOCK_SIZE_X
*
8
))
scales
=
tl
.
broadcast_to
(
scales
,
(
BLOCK_SIZE_Y
,
BLOCK_SIZE_X
*
8
))
# Dequantize.
# Dequantize.
...
@@ -108,19 +116,26 @@ def awq_dequantize_kernel(
...
@@ -108,19 +116,26 @@ def awq_dequantize_kernel(
def
awq_gemm_kernel
(
a_ptr
,
b_ptr
,
c_ptr
,
zeros_ptr
,
scales_ptr
,
M
,
N
,
K
,
def
awq_gemm_kernel
(
a_ptr
,
b_ptr
,
c_ptr
,
zeros_ptr
,
scales_ptr
,
M
,
N
,
K
,
group_size
,
BLOCK_SIZE_M
:
tl
.
constexpr
,
group_size
,
BLOCK_SIZE_M
:
tl
.
constexpr
,
BLOCK_SIZE_N
:
tl
.
constexpr
,
BLOCK_SIZE_K
:
tl
.
constexpr
,
BLOCK_SIZE_N
:
tl
.
constexpr
,
BLOCK_SIZE_K
:
tl
.
constexpr
,
SPLIT_K
:
tl
.
constexpr
):
GROUP_SIZE_M
:
tl
.
constexpr
,
SPLIT_K
:
tl
.
constexpr
):
pid
=
tl
.
program_id
(
axis
=
0
)
pid
=
tl
.
program_id
(
axis
=
0
)
pid_z
=
tl
.
program_id
(
1
)
pid_z
=
tl
.
program_id
(
1
)
# NOTE: This doesn't work in TRITON_INTERPRET=1 mode. Use below instead.
num_pid_m
=
tl
.
cdiv
(
M
,
BLOCK_SIZE_M
)
# num_pid_n = (N + BLOCK_SIZE_N - 1) // BLOCK_SIZE_N
num_pid_n
=
tl
.
cdiv
(
N
,
BLOCK_SIZE_N
)
num_pid_n
=
tl
.
cdiv
(
N
,
BLOCK_SIZE_N
)
if
GROUP_SIZE_M
==
1
:
pid_m
=
pid
//
num_pid_n
pid_m
=
pid
//
num_pid_n
pid_n
=
pid
%
num_pid_n
pid_n
=
pid
%
num_pid_n
else
:
accumulator_dtype
=
c_ptr
.
type
.
element_ty
num_pid_in_group
=
GROUP_SIZE_M
*
num_pid_n
group_id
=
pid
//
num_pid_in_group
first_pid_m
=
group_id
*
GROUP_SIZE_M
group_size_m
=
min
(
num_pid_m
-
first_pid_m
,
GROUP_SIZE_M
)
pid_m
=
first_pid_m
+
(
pid
%
group_size_m
)
pid_n
=
(
pid
%
num_pid_in_group
)
//
group_size_m
# accumulator_dtype = c_ptr.type.element_ty
BLOCK_SIZE_N_8
=
BLOCK_SIZE_N
//
8
N_8
=
N
//
8
# NOTE: This doesn't work in TRITON_INTERPRET=1 mode. Use below instead.
# NOTE: This doesn't work in TRITON_INTERPRET=1 mode. Use below instead.
# accumulator = tl.arange(0, BLOCK_SIZE_N)
# accumulator = tl.arange(0, BLOCK_SIZE_N)
# accumulator = tl.broadcast_to(accumulator[None, :],
# accumulator = tl.broadcast_to(accumulator[None, :],
...
@@ -128,16 +143,16 @@ def awq_gemm_kernel(a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, M, N, K,
...
@@ -128,16 +143,16 @@ def awq_gemm_kernel(a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, M, N, K,
# accumulator = accumulator & 0x0
# accumulator = accumulator & 0x0
# accumulator = accumulator.to(accumulator_dtype)
# accumulator = accumulator.to(accumulator_dtype)
accumulator
=
tl
.
zeros
((
BLOCK_SIZE_M
,
BLOCK_SIZE_N
),
accumulator
=
tl
.
zeros
((
BLOCK_SIZE_M
,
BLOCK_SIZE_N
),
dtype
=
accumulator_dtype
)
dtype
=
tl
.
float32
)
# Create reverse AWQ order as tensor: [0, 4, 1, 5, 2, 6, 3, 7]
# Create reverse AWQ order as tensor: [0, 4, 1, 5, 2, 6, 3, 7]
# that will map given indices to the correct order.
# that will map given indices to the correct order.
reverse_awq_order_tensor
=
((
tl
.
arange
(
0
,
2
)
*
4
)[
None
,
:]
+
shifts
=
((
tl
.
arange
(
0
,
2
)
*
16
)[
None
,
:]
+
tl
.
arange
(
0
,
4
)[:,
None
]).
reshape
(
8
)
(
tl
.
arange
(
0
,
4
)
*
4
)[:,
None
]).
reshape
(
1
,
8
)
# Create the necessary shifts to use to unpack.
# Create the necessary shifts to use to unpack.
shifts
=
reverse_awq_order_tensor
*
4
#
shifts = reverse_awq_order_tensor * 4
shifts
=
tl
.
broadcast_to
(
shifts
[
None
,
:]
,
shifts
=
tl
.
broadcast_to
(
shifts
,
(
BLOCK_SIZE_K
*
(
BLOCK_SIZE_N
//
8
),
8
))
(
BLOCK_SIZE_K
*
(
BLOCK_SIZE_N
//
8
),
8
))
shifts
=
tl
.
reshape
(
shifts
,
(
BLOCK_SIZE_K
,
BLOCK_SIZE_N
))
shifts
=
tl
.
reshape
(
shifts
,
(
BLOCK_SIZE_K
,
BLOCK_SIZE_N
))
...
@@ -145,18 +160,15 @@ def awq_gemm_kernel(a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, M, N, K,
...
@@ -145,18 +160,15 @@ def awq_gemm_kernel(a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, M, N, K,
offsets_am
=
pid_m
*
BLOCK_SIZE_M
+
tl
.
arange
(
0
,
BLOCK_SIZE_M
)
offsets_am
=
pid_m
*
BLOCK_SIZE_M
+
tl
.
arange
(
0
,
BLOCK_SIZE_M
)
masks_am
=
offsets_am
<
M
masks_am
=
offsets_am
<
M
offsets_bn
=
pid_n
*
(
BLOCK_SIZE_N
//
8
)
+
tl
.
arange
(
0
,
BLOCK_SIZE_N
//
8
)
offsets_bzn
=
pid_n
*
(
BLOCK_SIZE_N_8
)
+
tl
.
arange
(
0
,
BLOCK_SIZE_N
//
8
)
masks_bn
=
offsets_bn
<
N
//
8
masks_bzn
=
offsets_bzn
<
N_8
offsets_zn
=
pid_n
*
(
BLOCK_SIZE_N
//
8
)
+
tl
.
arange
(
0
,
BLOCK_SIZE_N
//
8
)
masks_zn
=
offsets_zn
<
N
//
8
offsets_sn
=
pid_n
*
BLOCK_SIZE_N
+
tl
.
arange
(
0
,
BLOCK_SIZE_N
)
offsets_sn
=
pid_n
*
BLOCK_SIZE_N
+
tl
.
arange
(
0
,
BLOCK_SIZE_N
)
masks_sn
=
offsets_sn
<
N
masks_sn
=
offsets_sn
<
N
offsets_k
=
pid_z
*
BLOCK_SIZE_K
+
tl
.
arange
(
0
,
BLOCK_SIZE_K
)
offsets_k
=
pid_z
*
BLOCK_SIZE_K
+
tl
.
arange
(
0
,
BLOCK_SIZE_K
)
offsets_a
=
K
*
offsets_am
[:,
None
]
+
offsets_k
[
None
,
:]
offsets_a
=
K
*
offsets_am
[:,
None
]
+
offsets_k
[
None
,
:]
offsets_b
=
(
N
//
8
)
*
offsets_k
[:,
None
]
+
offsets_bn
[
None
,
:]
offsets_b
=
(
N
_
8
)
*
offsets_k
[:,
None
]
+
offsets_b
z
n
[
None
,
:]
a_ptrs
=
a_ptr
+
offsets_a
a_ptrs
=
a_ptr
+
offsets_a
b_ptrs
=
b_ptr
+
offsets_b
b_ptrs
=
b_ptr
+
offsets_b
...
@@ -167,33 +179,40 @@ def awq_gemm_kernel(a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, M, N, K,
...
@@ -167,33 +179,40 @@ def awq_gemm_kernel(a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, M, N, K,
for
k
in
range
(
0
,
tl
.
cdiv
(
K
,
BLOCK_SIZE_K
*
SPLIT_K
)):
for
k
in
range
(
0
,
tl
.
cdiv
(
K
,
BLOCK_SIZE_K
*
SPLIT_K
)):
masks_k
=
offsets_k
<
K
masks_k
=
offsets_k
<
K
masks_a
=
masks_am
[:,
None
]
&
masks_k
[
None
,
:]
masks_a
=
masks_am
[:,
None
]
&
masks_k
[
None
,
:]
a
=
tl
.
load
(
a_ptrs
,
mask
=
masks_a
,
other
=
0.0
)
a
=
tl
.
load
(
a_ptrs
,
mask
=
masks_a
)
masks_b
=
masks_k
[:,
None
]
&
masks_bn
[
None
,
:]
masks_b
=
masks_k
[:,
None
]
&
masks_bzn
[
None
,
:]
b
=
tl
.
load
(
b_ptrs
,
mask
=
masks_b
,
other
=
0.0
)
b
=
tl
.
load
(
b_ptrs
,
mask
=
masks_b
)
b
=
tl
.
interleave
(
b
,
b
)
# b = tl.interleave(b, b)
b
=
tl
.
interleave
(
b
,
b
)
# b = tl.interleave(b, b)
b
=
tl
.
interleave
(
b
,
b
)
# b = tl.interleave(b, b)
b
=
tl
.
join
(
b
,
b
).
reshape
(
b
.
shape
[:
-
1
]
+
[
2
*
b
.
shape
[
-
1
]])
b
=
tl
.
join
(
b
,
b
).
reshape
(
b
.
shape
[:
-
1
]
+
[
2
*
b
.
shape
[
-
1
]])
b
=
tl
.
join
(
b
,
b
).
reshape
(
b
.
shape
[:
-
1
]
+
[
2
*
b
.
shape
[
-
1
]])
# Dequantize b.
# Dequantize b.
offsets_szk
=
(
offsets_szk
=
(
BLOCK_SIZE_K
*
SPLIT_K
*
k
+
pid_z
*
BLOCK_SIZE_K
)
//
group_size
(
BLOCK_SIZE_K
*
SPLIT_K
*
k
+
pid_z
*
BLOCK_SIZE_K
)
//
group_size
+
offsets_szk
=
offsets_szk
+
(
tl
.
arange
(
0
,
BLOCK_SIZE_K
)
//
group_size
)
tl
.
arange
(
0
,
1
))
offsets_z
=
(
N_8
)
*
offsets_szk
[:,
None
]
+
offsets_bzn
[
None
,
:]
offsets_z
=
(
N
//
8
)
*
offsets_szk
[:,
None
]
+
offsets_zn
[
None
,
:]
masks_zk
=
offsets_szk
<
K
//
group_size
masks_zk
=
offsets_szk
<
K
//
group_size
masks_z
=
masks_zk
[:,
None
]
&
masks_zn
[
None
,
:]
masks_z
=
masks_zk
[:,
None
]
&
masks_
b
zn
[
None
,
:]
zeros_ptrs
=
zeros_ptr
+
offsets_z
zeros_ptrs
=
zeros_ptr
+
offsets_z
zeros
=
tl
.
load
(
zeros_ptrs
,
mask
=
masks_z
,
other
=
0.0
)
zeros
=
tl
.
load
(
zeros_ptrs
,
mask
=
masks_z
)
zeros
=
tl
.
interleave
(
zeros
,
zeros
)
# zeros = tl.interleave(zeros, zeros)
zeros
=
tl
.
interleave
(
zeros
,
zeros
)
# zeros = tl.interleave(zeros, zeros)
zeros
=
tl
.
interleave
(
zeros
,
zeros
)
# zeros = tl.interleave(zeros, zeros)
zeros
=
tl
.
join
(
zeros
,
zeros
).
reshape
(
zeros
.
shape
[:
-
1
]
+
[
2
*
zeros
.
shape
[
-
1
]])
zeros
=
tl
.
join
(
zeros
,
zeros
).
reshape
(
zeros
.
shape
[:
-
1
]
+
[
2
*
zeros
.
shape
[
-
1
]])
zeros
=
tl
.
join
(
zeros
,
zeros
).
reshape
(
zeros
.
shape
[:
-
1
]
+
[
2
*
zeros
.
shape
[
-
1
]])
zeros
=
tl
.
broadcast_to
(
zeros
,
(
BLOCK_SIZE_K
,
BLOCK_SIZE_N
))
zeros
=
tl
.
broadcast_to
(
zeros
,
(
BLOCK_SIZE_K
,
BLOCK_SIZE_N
))
offsets_s
=
N
*
offsets_szk
[:,
None
]
+
offsets_sn
[
None
,
:]
offsets_s
=
N
*
offsets_szk
[:,
None
]
+
offsets_sn
[
None
,
:]
masks_sk
=
offsets_szk
<
K
//
group_size
masks_sk
=
offsets_szk
<
K
//
group_size
masks_s
=
masks_sk
[:,
None
]
&
masks_sn
[
None
,
:]
masks_s
=
masks_sk
[:,
None
]
&
masks_sn
[
None
,
:]
scales_ptrs
=
scales_ptr
+
offsets_s
scales_ptrs
=
scales_ptr
+
offsets_s
scales
=
tl
.
load
(
scales_ptrs
,
mask
=
masks_s
,
other
=
0.0
)
scales
=
tl
.
load
(
scales_ptrs
,
mask
=
masks_s
)
scales
=
tl
.
broadcast_to
(
scales
,
(
BLOCK_SIZE_K
,
BLOCK_SIZE_N
))
scales
=
tl
.
broadcast_to
(
scales
,
(
BLOCK_SIZE_K
,
BLOCK_SIZE_N
))
b
=
(
b
>>
shifts
)
&
0xF
b
=
(
b
>>
shifts
)
&
0xF
...
@@ -202,18 +221,20 @@ def awq_gemm_kernel(a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, M, N, K,
...
@@ -202,18 +221,20 @@ def awq_gemm_kernel(a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, M, N, K,
b
=
b
.
to
(
c_ptr
.
type
.
element_ty
)
b
=
b
.
to
(
c_ptr
.
type
.
element_ty
)
# Accumulate results.
# Accumulate results.
accumulator
=
tl
.
dot
(
a
,
b
,
accumulator
,
out_dtype
=
accumulator_dtype
)
accumulator
=
tl
.
dot
(
a
,
b
,
accumulator
,
out_dtype
=
tl
.
float32
)
offsets_k
+=
BLOCK_SIZE_K
*
SPLIT_K
offsets_k
+=
BLOCK_SIZE_K
*
SPLIT_K
a_ptrs
+=
BLOCK_SIZE_K
*
SPLIT_K
a_ptrs
+=
BLOCK_SIZE_K
*
SPLIT_K
b_ptrs
+=
BLOCK_SIZE_K
*
SPLIT_K
*
(
N
//
8
)
b_ptrs
+=
BLOCK_SIZE_K
*
SPLIT_K
*
(
N
_
8
)
c
=
accumulator
.
to
(
c_ptr
.
type
.
element_ty
)
c
=
accumulator
.
to
(
c_ptr
.
type
.
element_ty
)
offs_cm
=
pid_m
*
BLOCK_SIZE_M
+
tl
.
arange
(
0
,
BLOCK_SIZE_M
)
c_ptrs
=
c_ptr
+
N
*
offsets_am
[:,
None
]
+
offsets_sn
[
None
,
:]
offs_cn
=
pid_n
*
BLOCK_SIZE_N
+
tl
.
arange
(
0
,
BLOCK_SIZE_N
)
c_mask
=
masks_am
[:,
None
]
&
masks_sn
[
None
,
:]
c_ptrs
=
c_ptr
+
pid_z
*
N
*
M
+
N
*
offs_cm
[:,
None
]
+
offs_cn
[
None
,
:]
if
SPLIT_K
==
1
:
c_mask
=
(
offs_cm
[:,
None
]
<
M
)
&
(
offs_cn
[
None
,
:]
<
N
)
tl
.
store
(
c_ptrs
,
c
,
mask
=
c_mask
)
tl
.
store
(
c_ptrs
,
c
,
mask
=
c_mask
)
# tl.store(c_ptrs, c)
else
:
tl
.
atomic_add
(
c_ptrs
,
c
,
mask
=
c_mask
)
# qweights - [K , M // 8], int32
# qweights - [K , M // 8], int32
...
@@ -272,9 +293,7 @@ def awq_gemm_triton(input: torch.Tensor,
...
@@ -272,9 +293,7 @@ def awq_gemm_triton(input: torch.Tensor,
scales
:
torch
.
Tensor
,
scales
:
torch
.
Tensor
,
qzeros
:
torch
.
Tensor
,
qzeros
:
torch
.
Tensor
,
split_k_iters
:
int
,
split_k_iters
:
int
,
block_size_m
:
int
=
32
,
config
)
->
torch
.
Tensor
:
block_size_n
:
int
=
32
,
block_size_k
:
int
=
32
)
->
torch
.
Tensor
:
M
,
K
=
input
.
shape
M
,
K
=
input
.
shape
N
=
qweight
.
shape
[
1
]
*
8
N
=
qweight
.
shape
[
1
]
*
8
group_size
=
qweight
.
shape
[
0
]
//
qzeros
.
shape
[
0
]
group_size
=
qweight
.
shape
[
0
]
//
qzeros
.
shape
[
0
]
...
@@ -289,31 +308,27 @@ def awq_gemm_triton(input: torch.Tensor,
...
@@ -289,31 +308,27 @@ def awq_gemm_triton(input: torch.Tensor,
assert
group_size
in
AWQ_TRITON_SUPPORTED_GROUP_SIZES
or
group_size
==
K
assert
group_size
in
AWQ_TRITON_SUPPORTED_GROUP_SIZES
or
group_size
==
K
grid
=
lambda
META
:
(
grid
=
lambda
META
:
(
triton
.
cdiv
(
M
,
META
[
'BLOCK_SIZE_M'
])
*
triton
.
cdiv
(
triton
.
cdiv
(
M
,
META
[
'BLOCK_SIZE_M'
])
*
triton
.
cdiv
(
N
,
META
[
'BLOCK_SIZE_N'
]),
N
,
META
[
'BLOCK_SIZE_N'
]),
META
[
'SPLIT_K'
],
split_k_iters
,
)
)
if
config
is
None
:
result
=
torch
.
zeros
((
split_k_iters
,
M
,
N
),
config
=
{
'BLOCK_SIZE_M'
:
16
,
'BLOCK_SIZE_N'
:
64
,
'BLOCK_SIZE_K'
:
32
,
'GROUP_SIZE_M'
:
8
,
'SPLIT_K'
:
8
}
dtype
=
scales
.
dtype
,
if
M
>
256
:
device
=
input
.
device
)
#print("INFO:this size not found in json.")
config
=
{
'BLOCK_SIZE_M'
:
128
,
'BLOCK_SIZE_N'
:
64
,
'BLOCK_SIZE_K'
:
64
,
'GROUP_SIZE_M'
:
8
,
'SPLIT_K'
:
1
}
result
=
torch
.
zeros
((
M
,
N
),
dtype
=
scales
.
dtype
,
device
=
input
.
device
)
# A = input, B = qweight, C = result
# A = input, B = qweight, C = result
# A = M x K, B = K x N, C = M x N
# A = M x K, B = K x N, C = M x N
awq_gemm_kernel
[
grid
](
input
,
awq_gemm_kernel
[
grid
](
input
,
qweight
,
qweight
,
result
,
result
,
qzeros
,
qzeros
,
scales
,
scales
,
M
,
M
,
N
,
N
,
K
,
K
,
group_size
,
group_size
,
BLOCK_SIZE_M
=
block_size_m
,
**
config
)
BLOCK_SIZE_N
=
block_size_n
,
BLOCK_SIZE_K
=
block_size_k
,
SPLIT_K
=
split_k_iters
)
result
=
result
.
sum
(
0
)
return
result
return
result
vllm/model_executor/layers/quantization/configs/awq/AWQ_1536_7168_BW200.json
0 → 100644
View file @
47bd229c
{
"1536_7168"
:
{
"1"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"2"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"3"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"4"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"5"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"6"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"7"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"8"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"9"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"10"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"11"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"12"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"13"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"14"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"15"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"16"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"32"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"64"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"128"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"256"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"512"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"1024"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
2
,
"num_stages"
:
1
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"2048"
:
{
"BLOCK_SIZE_M"
:
256
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
2
,
"num_stages"
:
1
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"4096"
:
{
"BLOCK_SIZE_M"
:
256
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
}
}
}
\ No newline at end of file
vllm/model_executor/layers/quantization/configs/awq/AWQ_1536_7168_K100_AI.json
0 → 100644
View file @
47bd229c
{
"1536_7168"
:
{
"1"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"2"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"3"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"4"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"5"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"6"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"7"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"8"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"9"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"10"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"11"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"12"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"13"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"14"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"15"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"16"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"32"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"64"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
0
},
"128"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"256"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
4
,
"num_stages"
:
0
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"512"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"1024"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
2
,
"num_stages"
:
1
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"2048"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"4096"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
}
}
}
\ No newline at end of file
vllm/model_executor/layers/quantization/configs/awq/AWQ_3072_1536_BW200.json
0 → 100644
View file @
47bd229c
{
"3072_1536"
:
{
"1"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"2"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"3"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"4"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"5"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"6"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"7"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"8"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"9"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"10"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"11"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"12"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"13"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"14"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"15"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"16"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"32"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"64"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
4
,
"num_stages"
:
0
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"128"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
2
,
"num_stages"
:
0
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"256"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"512"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"1024"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"2048"
:
{
"BLOCK_SIZE_M"
:
256
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"4096"
:
{
"BLOCK_SIZE_M"
:
256
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
}
}
}
\ No newline at end of file
vllm/model_executor/layers/quantization/configs/awq/AWQ_3072_1536_K100_AI.json
0 → 100644
View file @
47bd229c
{
"3072_1536"
:
{
"1"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"2"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"3"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"4"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"5"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"6"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"7"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"8"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"9"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"10"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"11"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"12"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"13"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"14"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"15"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"16"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"32"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"64"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
0
},
"128"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"256"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"512"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"1024"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"2048"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"4096"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
}
}
}
\ No newline at end of file
vllm/model_executor/layers/quantization/configs/awq/AWQ_4096_512_BW200.json
0 → 100644
View file @
47bd229c
{
"4096_512"
:
{
"1"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"2"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"3"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"4"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"5"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"6"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"7"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"8"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"9"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"10"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"11"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"12"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"13"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"14"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"15"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"16"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"32"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"64"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"128"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"256"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"512"
:
{
"BLOCK_SIZE_M"
:
256
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"1024"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"2048"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"4096"
:
{
"BLOCK_SIZE_M"
:
256
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
}
}
}
\ No newline at end of file
vllm/model_executor/layers/quantization/configs/awq/AWQ_4096_512_K100_AI.json
0 → 100644
View file @
47bd229c
{
"4096_512"
:
{
"1"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"2"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"3"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"4"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"5"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"6"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"7"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"8"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"9"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"10"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"11"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"12"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"13"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"14"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"15"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"16"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"32"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
2
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"64"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"128"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"256"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"512"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"1024"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"2048"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"4096"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
}
}
}
\ No newline at end of file
vllm/model_executor/layers/quantization/configs/awq/AWQ_4608_7168_BW200.json
0 → 100644
View file @
47bd229c
{
"4608_7168"
:
{
"1"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
2
,
"num_ldmatrixes"
:
1
},
"2"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
2
,
"num_ldmatrixes"
:
1
},
"3"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
2
,
"num_ldmatrixes"
:
1
},
"4"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
2
,
"num_ldmatrixes"
:
1
},
"5"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
2
,
"num_ldmatrixes"
:
1
},
"6"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
2
,
"num_ldmatrixes"
:
1
},
"7"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
2
,
"num_ldmatrixes"
:
1
},
"8"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
2
,
"num_ldmatrixes"
:
1
},
"9"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
2
,
"num_ldmatrixes"
:
1
},
"10"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
2
,
"num_ldmatrixes"
:
1
},
"11"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
2
,
"num_ldmatrixes"
:
1
},
"12"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
2
,
"num_ldmatrixes"
:
1
},
"13"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
2
,
"num_ldmatrixes"
:
1
},
"14"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
2
,
"num_ldmatrixes"
:
1
},
"15"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
2
,
"num_ldmatrixes"
:
1
},
"16"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
2
,
"num_ldmatrixes"
:
1
},
"32"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"64"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"128"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"256"
:
{
"BLOCK_SIZE_M"
:
256
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
2
,
"num_stages"
:
1
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"512"
:
{
"BLOCK_SIZE_M"
:
256
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"1024"
:
{
"BLOCK_SIZE_M"
:
256
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"2048"
:
{
"BLOCK_SIZE_M"
:
256
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"4096"
:
{
"BLOCK_SIZE_M"
:
256
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
}
}
}
\ No newline at end of file
vllm/model_executor/layers/quantization/configs/awq/AWQ_4608_7168_K100_AI.json
0 → 100644
View file @
47bd229c
{
"4608_7168"
:
{
"1"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"2"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"3"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"4"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"5"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"6"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"7"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"8"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"9"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"10"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"11"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"12"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"13"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"14"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"15"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"16"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"32"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"64"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
4
,
"num_stages"
:
0
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"128"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"256"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"512"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
2
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"1024"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"2048"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"4096"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
}
}
}
\ No newline at end of file
vllm/model_executor/layers/quantization/configs/awq/AWQ_512_7168_BW200.json
0 → 100644
View file @
47bd229c
{
"512_7168"
:
{
"1"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"2"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"3"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"4"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"5"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"6"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"7"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"8"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"9"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"10"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"11"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"12"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"13"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"14"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"15"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"16"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"32"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"64"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"128"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"256"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"512"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
4
,
"num_stages"
:
0
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"1024"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"2048"
:
{
"BLOCK_SIZE_M"
:
256
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
2
,
"num_stages"
:
1
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"4096"
:
{
"BLOCK_SIZE_M"
:
256
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
}
}
}
\ No newline at end of file
vllm/model_executor/layers/quantization/configs/awq/AWQ_512_7168_K100_AI.json
0 → 100644
View file @
47bd229c
{
"512_7168"
:
{
"1"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"2"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"3"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"4"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"5"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"6"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"7"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"8"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"9"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"10"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"11"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"12"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"13"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"14"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"15"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"16"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"32"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"64"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"128"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"256"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"512"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
4
,
"num_stages"
:
0
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"1024"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
4
,
"num_stages"
:
0
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"2048"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
2
,
"num_stages"
:
0
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"4096"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
2
,
"num_stages"
:
1
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
}
}
}
\ No newline at end of file
vllm/model_executor/layers/quantization/configs/awq/AWQ_576_7168_BW200.json
0 → 100644
View file @
47bd229c
{
"576_7168"
:
{
"1"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"2"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"3"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"4"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"5"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"6"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"7"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"8"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"9"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"10"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"11"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"12"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"13"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"14"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"15"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"16"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"32"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"64"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"128"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"256"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"512"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
4
,
"num_stages"
:
0
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"1024"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"2048"
:
{
"BLOCK_SIZE_M"
:
256
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
2
,
"num_stages"
:
1
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"4096"
:
{
"BLOCK_SIZE_M"
:
256
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
}
}
}
\ No newline at end of file
vllm/model_executor/layers/quantization/configs/awq/AWQ_576_7168_K100_AI.json
0 → 100644
View file @
47bd229c
{
"576_7168"
:
{
"1"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"2"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"3"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"4"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"5"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"6"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"7"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"8"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"9"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"10"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"11"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"12"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"13"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"14"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"15"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"16"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"32"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"64"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"128"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"256"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
8
,
"num_stages"
:
0
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"512"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
4
,
"num_stages"
:
0
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"1024"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"2048"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"4096"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
2
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
}
}
}
\ No newline at end of file
vllm/model_executor/layers/quantization/configs/awq/AWQ_7168_2048_BW200.json
0 → 100644
View file @
47bd229c
{
"7168_2048"
:
{
"1"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"2"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"3"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"4"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
8
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"5"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"6"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"7"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"8"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"9"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"10"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"11"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"12"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"13"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"14"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"15"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"16"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
0
},
"32"
:
{
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
4
,
"num_stages"
:
1
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"64"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
2
,
"num_stages"
:
0
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"128"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"256"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"512"
:
{
"BLOCK_SIZE_M"
:
256
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
0
,
"num_warps"
:
4
,
"num_ldmatrixes"
:
1
},
"1024"
:
{
"BLOCK_SIZE_M"
:
256
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
8
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"2048"
:
{
"BLOCK_SIZE_M"
:
256
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
},
"4096"
:
{
"BLOCK_SIZE_M"
:
256
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
32
,
"GROUP_SIZE_M"
:
4
,
"SPLIT_K"
:
1
,
"num_stages"
:
1
,
"num_warps"
:
8
,
"num_ldmatrixes"
:
1
}
}
}
\ No newline at end of file
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment