Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
ec5e299c
Commit
ec5e299c
authored
Feb 21, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.7.3' into v0.7.3-dev
parents
47bd229c
ed6e9075
Changes
521
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1933 additions
and
0 deletions
+1933
-0
vllm/model_executor/layers/quantization/neuron_quant.py
vllm/model_executor/layers/quantization/neuron_quant.py
+1
-0
vllm/model_executor/layers/quantization/ptpc_fp8.py
vllm/model_executor/layers/quantization/ptpc_fp8.py
+125
-0
vllm/model_executor/layers/quantization/qqq.py
vllm/model_executor/layers/quantization/qqq.py
+1
-0
vllm/model_executor/layers/quantization/quark/quark.py
vllm/model_executor/layers/quantization/quark/quark.py
+1
-0
vllm/model_executor/layers/quantization/tpu_int8.py
vllm/model_executor/layers/quantization/tpu_int8.py
+1
-0
vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
...Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
+164
-0
vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
...Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
+164
-0
vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
...Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
+164
-0
vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
...Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
+164
-0
vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
...Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
+164
-0
vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
...ame=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
+0
-0
vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
...Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
+164
-0
vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
...Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
+164
-0
vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
...ame=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
+0
-0
vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
...Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
+164
-0
vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
...ame=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
+0
-0
vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
...Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
+164
-0
vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
...ame=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
+0
-0
vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
...Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
+164
-0
vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
...Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
+164
-0
No files found.
Too many changes to show.
To preserve performance only
521 of 521+
files are displayed.
Plain diff
Email patch
vllm/model_executor/layers/quantization/neuron_quant.py
View file @
ec5e299c
...
@@ -20,6 +20,7 @@ class NeuronQuantConfig(QuantizationConfig):
...
@@ -20,6 +20,7 @@ class NeuronQuantConfig(QuantizationConfig):
dequant_dtype
:
str
=
"f16"
,
dequant_dtype
:
str
=
"f16"
,
quantize_method
:
str
=
"vector_dynamic"
,
quantize_method
:
str
=
"vector_dynamic"
,
)
->
None
:
)
->
None
:
super
().
__init__
()
self
.
quant_dtype
=
os
.
getenv
(
"NEURON_QUANT_DTYPE"
,
"s8"
)
self
.
quant_dtype
=
os
.
getenv
(
"NEURON_QUANT_DTYPE"
,
"s8"
)
if
self
.
quant_dtype
not
in
SUPPORTED_QUANT_DTYPE_LIST
:
if
self
.
quant_dtype
not
in
SUPPORTED_QUANT_DTYPE_LIST
:
raise
ValueError
(
raise
ValueError
(
...
...
vllm/model_executor/layers/quantization/ptpc_fp8.py
0 → 100644
View file @
ec5e299c
# SPDX-License-Identifier: Apache-2.0
from
typing
import
Any
,
Dict
,
List
,
Optional
import
torch
from
torch.nn.parameter
import
Parameter
from
vllm
import
_custom_ops
as
ops
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.linear
import
(
LinearBase
,
UnquantizedLinearMethod
)
from
vllm.model_executor.layers.quantization.base_config
import
(
QuantizeMethodBase
)
from
vllm.model_executor.layers.quantization.fp8
import
(
Fp8Config
,
Fp8KVCacheMethod
,
Fp8LinearMethod
)
from
vllm.model_executor.layers.quantization.utils.quant_utils
import
(
is_layer_skipped
)
from
vllm.model_executor.layers.quantization.utils.w8a8_utils
import
(
apply_fp8_linear
)
from
vllm.platforms
import
current_platform
# Names of the activation quantization schemes. PTPCFp8Config below rejects
# "static" and defaults to "dynamic".
ACTIVATION_SCHEMES = [
    "static",
    "dynamic",
]

# Module-level logger, named after this module.
logger = init_logger(__name__)
class PTPCFp8Config(Fp8Config):
    """Quantization config for Per-Token-Per-Channel (PTPC) dynamic FP8.

    Construction raises ``ValueError`` unless the current platform reports
    ROCm with device capability 94 or higher, and the ``"static"``
    activation scheme is rejected.
    """

    def __init__(
        self,
        activation_scheme: str = "dynamic",
        ignored_layers: Optional[List[str]] = None,
    ) -> None:
        """Validate the platform and scheme, then initialise ``Fp8Config``.

        Args:
            activation_scheme: Activation quantization scheme. ``"static"``
                is rejected here; the value is otherwise forwarded to the
                parent constructor.
            ignored_layers: Forwarded to the parent constructor and later
                consulted by ``get_quant_method`` to skip layers.

        Raises:
            ValueError: If the platform is not ROCm, the device capability
                is below 94, or ``activation_scheme`` is ``"static"``.
        """
        # The checks run in this order so the most basic platform
        # requirement is reported first.
        if not current_platform.is_rocm():
            raise ValueError(
                "ptpc_fp8 quantization is supported only on ROCm.")

        if not current_platform.has_device_capability(94):
            raise ValueError(
                "ptpc_fp8 quantization is supported only on AMD Instinct MI300 GPUs and newer."  # noqa: E501
            )

        if activation_scheme == "static":
            raise ValueError(
                "ptpc_fp8 as of now only support dynamic quantization.")

        # The checkpoint is always declared as not FP8-serialized.
        super().__init__(
            is_checkpoint_fp8_serialized=False,
            activation_scheme=activation_scheme,
            ignored_layers=ignored_layers,
        )

    @classmethod
    def get_name(cls) -> str:
        """Return the name of this quantization method."""
        return "ptpc_fp8"

    @classmethod
    def from_config(cls, config: Dict[str, Any]) -> "PTPCFp8Config":
        """Build a config from a quantization config dictionary.

        ``"activation_scheme"`` is looked up with ``get_from_keys``;
        ``"ignored_layers"`` is looked up with ``get_from_keys_or`` and
        falls back to ``None``.
        """
        return cls(
            activation_scheme=cls.get_from_keys(config,
                                                ["activation_scheme"]),
            ignored_layers=cls.get_from_keys_or(config, ["ignored_layers"],
                                                None),
        )

    def get_quant_method(self, layer: torch.nn.Module,
                         prefix: str) -> Optional["QuantizeMethodBase"]:
        """Select the quantization method object for ``layer``.

        Args:
            layer: The module to pick a method for.
            prefix: Passed to ``is_layer_skipped`` together with
                ``self.ignored_layers``.

        Returns:
            ``UnquantizedLinearMethod`` for a skipped ``LinearBase`` layer,
            ``PTPCFp8LinearMethod`` for any other ``LinearBase`` layer,
            ``Fp8KVCacheMethod`` for an ``Attention`` layer, otherwise
            ``None``.
        """
        from vllm.attention.layer import Attention  # Avoid circular import

        if isinstance(layer, LinearBase):
            skip_layer = is_layer_skipped(prefix, self.ignored_layers)
            if skip_layer:
                return UnquantizedLinearMethod()
            return PTPCFp8LinearMethod(self)

        if isinstance(layer, Attention):
            return Fp8KVCacheMethod(self)

        return None
class PTPCFp8LinearMethod(Fp8LinearMethod):
    """Linear method for Per-Token and Per-Channel FP8 Quantization.

    Only BF16 model checkpoints with dynamic activation scaling are
    supported. To load an FP16 checkpoint, the user must specify that the
    FP16 weights be converted to BF16 on load.
    The weight scaling factor is initialized after the model weights are
    loaded.

    Limitations:
    1. Only support float8_e4m3fnuz data type due to the limitation of
       torch._scaled_mm (https://github.com/ROCm/pytorch/blob/8c0504d7f3fb0ee4c278c096a5c3caedb01129fa/aten/src/ATen/native/cuda/Blas.cpp#L1041)

    Args:
        quant_config: The quantization config.
    """

    def __init__(self, quant_config: PTPCFp8Config):
        """Initialise the parent method and force weight quantization."""
        super().__init__(quant_config=quant_config)
        # Force weight quantization
        self.quant_config.is_checkpoint_fp8_serialized = False

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        """Quantize the loaded weight of ``layer`` to FP8 in place.

        After this call ``layer.weight`` holds the transposed quantized
        weight, ``layer.weight_scale`` holds the scale returned by the
        quantization op, and ``layer.input_scale`` is ``None``.

        Args:
            layer: Module whose ``weight`` has just been loaded.

        Raises:
            AssertionError: If the loaded weight is not ``torch.bfloat16``.
        """
        # Re-wrap the loaded tensor as a parameter without gradients.
        layer.weight = torch.nn.Parameter(layer.weight.data,
                                          requires_grad=False)

        assert layer.weight.data.dtype == torch.bfloat16, (
            f"Currently torch._scaled_mm (hipBLASLt) rowwise gemm only support output dtype of bfloat16. {str(layer.weight.data.dtype)} is specified."  # noqa: E501
        )

        # Quantize the weights.
        quantized_weight, scale_factor = ops.scaled_fp8_quant(
            layer.weight,
            scale=None,
            use_per_token_if_dynamic=True,
        )

        # Update the layer with the new values.
        # The weight is stored pretransposed.
        layer.weight = Parameter(quantized_weight.t(), requires_grad=False)
        layer.weight_scale = Parameter(scale_factor, requires_grad=False)
        layer.input_scale = None

    def apply(self,
              layer: torch.nn.Module,
              x: torch.Tensor,
              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
        """Run the FP8 linear op with the layer's quantized weight.

        Args:
            layer: Module providing ``weight`` and ``weight_scale``.
            x: Input tensor.
            bias: Optional bias forwarded to ``apply_fp8_linear``.

        Returns:
            The tensor returned by ``apply_fp8_linear``.
        """
        # No input scale is supplied and the per-token dynamic path is
        # requested; the CUTLASS path is explicitly disabled.
        output = apply_fp8_linear(
            input=x,
            weight=layer.weight,
            weight_scale=layer.weight_scale,
            input_scale=None,
            input_scale_ub=None,
            bias=bias,
            cutlass_fp8_supported=False,
            use_per_token_if_dynamic=True,
        )
        return output
vllm/model_executor/layers/quantization/qqq.py
View file @
ec5e299c
...
@@ -39,6 +39,7 @@ class QQQConfig(QuantizationConfig):
...
@@ -39,6 +39,7 @@ class QQQConfig(QuantizationConfig):
group_size
:
int
,
group_size
:
int
,
is_sym
:
bool
=
True
,
is_sym
:
bool
=
True
,
)
->
None
:
)
->
None
:
super
().
__init__
()
self
.
weight_bits
=
weight_bits
self
.
weight_bits
=
weight_bits
self
.
group_size
=
group_size
self
.
group_size
=
group_size
self
.
is_sym
=
is_sym
self
.
is_sym
=
is_sym
...
...
vllm/model_executor/layers/quantization/quark/quark.py
View file @
ec5e299c
...
@@ -30,6 +30,7 @@ class QuarkConfig(QuantizationConfig):
...
@@ -30,6 +30,7 @@ class QuarkConfig(QuantizationConfig):
kv_cache_group
:
Optional
[
List
[
str
]]
=
None
,
kv_cache_group
:
Optional
[
List
[
str
]]
=
None
,
kv_cache_config
:
Optional
[
Dict
[
str
,
Any
]]
=
None
,
kv_cache_config
:
Optional
[
Dict
[
str
,
Any
]]
=
None
,
pack_method
:
str
=
"reorder"
):
pack_method
:
str
=
"reorder"
):
super
().
__init__
()
if
kv_cache_group
is
None
:
if
kv_cache_group
is
None
:
kv_cache_group
=
[]
kv_cache_group
=
[]
self
.
quant_config
=
quant_config
self
.
quant_config
=
quant_config
...
...
vllm/model_executor/layers/quantization/tpu_int8.py
View file @
ec5e299c
...
@@ -21,6 +21,7 @@ class Int8TpuConfig(QuantizationConfig):
...
@@ -21,6 +21,7 @@ class Int8TpuConfig(QuantizationConfig):
self
,
self
,
activation_scheme
:
str
=
"none"
,
activation_scheme
:
str
=
"none"
,
)
->
None
:
)
->
None
:
super
().
__init__
()
if
activation_scheme
not
in
ACTIVATION_SCHEMES
:
if
activation_scheme
not
in
ACTIVATION_SCHEMES
:
raise
ValueError
(
raise
ValueError
(
f
"Unsupported activation scheme
{
activation_scheme
}
"
)
f
"Unsupported activation scheme
{
activation_scheme
}
"
)
...
...
vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
0 → 100644
View file @
ec5e299c
{
"1"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
1
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"2"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
8
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"4"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
16
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"8"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
16
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"16"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
8
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"24"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
8
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"32"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
16
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"48"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
8
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"64"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
8
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"96"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
16
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"128"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
16
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"256"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
8
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"512"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
1
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"1024"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"GROUP_SIZE_M"
:
8
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"1536"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"GROUP_SIZE_M"
:
1
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"2048"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"GROUP_SIZE_M"
:
8
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"3072"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"4096"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
}
}
\ No newline at end of file
vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
0 → 100644
View file @
ec5e299c
{
"1"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
1
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"2"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
16
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"4"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
8
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"8"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
8
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"16"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
8
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"24"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
16
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"32"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"48"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
8
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"64"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
8
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"96"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
8
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"128"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
8
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"256"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
1
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"512"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"GROUP_SIZE_M"
:
8
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"1024"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"GROUP_SIZE_M"
:
1
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"1536"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"GROUP_SIZE_M"
:
1
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"2048"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"3072"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"4096"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
}
}
\ No newline at end of file
vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
0 → 100644
View file @
ec5e299c
{
"1"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
8
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"2"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
1
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"4"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
16
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"8"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
8
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"16"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
16
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"24"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
1
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"32"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"48"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"64"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
8
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"96"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"128"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"256"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"GROUP_SIZE_M"
:
16
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"512"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
1
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"1024"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
8
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"1536"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"GROUP_SIZE_M"
:
16
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"2048"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"3072"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"4096"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
}
}
\ No newline at end of file
vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
0 → 100644
View file @
ec5e299c
{
"1"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
1
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"2"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
16
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"4"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
8
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"8"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
16
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"16"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
8
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"24"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
16
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"32"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"48"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
8
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"64"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
8
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"96"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
8
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"128"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
16
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"256"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
1
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"512"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
1
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"1024"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"GROUP_SIZE_M"
:
1
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"1536"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"2048"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"3072"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"4096"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
}
}
\ No newline at end of file
vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
0 → 100644
View file @
ec5e299c
{
"1"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
8
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"2"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
8
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"4"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"8"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
1
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"16"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
16
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"24"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
16
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"32"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
1
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"48"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"GROUP_SIZE_M"
:
8
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"64"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"96"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
128
,
"GROUP_SIZE_M"
:
1
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"128"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"GROUP_SIZE_M"
:
1
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"256"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"GROUP_SIZE_M"
:
1
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"512"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"GROUP_SIZE_M"
:
1
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"1024"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"1536"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"2048"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"3072"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"4096"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
}
}
\ No newline at end of file
vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,
128].json
→
vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
View file @
ec5e299c
File moved
vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
0 → 100644
View file @
ec5e299c
{
"1"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
1
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"2"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
16
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"4"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
8
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"8"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
8
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"16"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
8
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"24"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
16
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"32"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
8
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"48"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
16
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"64"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
16
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"96"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
8
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"128"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
8
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"256"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
8
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"512"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
8
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"1024"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
16
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"1536"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
16
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"2048"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
16
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"3072"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"4096"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
}
}
\ No newline at end of file
vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
0 → 100644
View file @
ec5e299c
{
"1"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
1
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"2"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
16
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"4"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
8
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"8"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
8
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"16"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
8
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"24"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
8
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"32"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
16
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"48"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
16
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"64"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
16
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"96"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
16
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"128"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
8
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"256"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
1
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"512"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"GROUP_SIZE_M"
:
1
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"1024"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"GROUP_SIZE_M"
:
1
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"1536"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"2048"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"3072"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"4096"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
}
}
\ No newline at end of file
vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,
128].json
→
vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
View file @
ec5e299c
File moved
vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
0 → 100644
View file @
ec5e299c
{
"1"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
16
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"2"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
16
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"4"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
16
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"8"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
16
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"16"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"24"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
16
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"32"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"48"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"64"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
16
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"96"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
8
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"128"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
1
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"256"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
1
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"512"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"GROUP_SIZE_M"
:
1
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"1024"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"GROUP_SIZE_M"
:
1
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"1536"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"GROUP_SIZE_M"
:
1
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"2048"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"GROUP_SIZE_M"
:
1
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"3072"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"GROUP_SIZE_M"
:
1
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"4096"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
}
}
\ No newline at end of file
vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,
128].json
→
vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
View file @
ec5e299c
File moved
vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
0 → 100644
View file @
ec5e299c
{
"1"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"GROUP_SIZE_M"
:
8
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"2"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"4"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"GROUP_SIZE_M"
:
1
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"8"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"GROUP_SIZE_M"
:
1
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"16"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"GROUP_SIZE_M"
:
1
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"24"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"GROUP_SIZE_M"
:
1
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"32"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"GROUP_SIZE_M"
:
1
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"48"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"GROUP_SIZE_M"
:
8
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"64"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"96"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
128
,
"GROUP_SIZE_M"
:
1
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"128"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"256"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"GROUP_SIZE_M"
:
1
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"512"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"GROUP_SIZE_M"
:
1
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"1024"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"GROUP_SIZE_M"
:
1
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"1536"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"2048"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"GROUP_SIZE_M"
:
1
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"3072"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"4096"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
}
}
\ No newline at end of file
vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,
128].json
→
vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
View file @
ec5e299c
File moved
vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
0 → 100644
View file @
ec5e299c
{
"1"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"2"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"4"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
128
,
"GROUP_SIZE_M"
:
1
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"8"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"16"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
64
,
"GROUP_SIZE_M"
:
16
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"24"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
128
,
"GROUP_SIZE_M"
:
16
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"32"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
128
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"48"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"GROUP_SIZE_M"
:
8
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"64"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"96"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
128
,
"GROUP_SIZE_M"
:
1
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"128"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"256"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"GROUP_SIZE_M"
:
16
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"512"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"GROUP_SIZE_M"
:
8
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"1024"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"1536"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"2048"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"3072"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"4096"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
}
}
\ No newline at end of file
vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json
0 → 100644
View file @
ec5e299c
{
"1"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
16
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"2"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
8
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"4"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
8
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"8"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
16
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"16"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
8
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"24"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"32"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"48"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"64"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"96"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
32
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
8
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"128"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"256"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"512"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
32
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"1024"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"1536"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"GROUP_SIZE_M"
:
16
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"2048"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"3072"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
},
"4096"
:
{
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"GROUP_SIZE_M"
:
32
,
"kpack"
:
1
,
"matrix_instr_nonkdim"
:
16
,
"num_warps"
:
4
}
}
\ No newline at end of file
Prev
1
…
18
19
20
21
22
23
24
25
26
27
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment