"csrc/vscode:/vscode.git/clone" did not exist on "59449095ab536febe9ff341b2a88a4fed572a70f"
Commit 47bd229c authored by yangql's avatar yangql
Browse files

适配deepseekv3\v2 moe awq的推理支持

parent 4a734b9d
This diff is collapsed.
......@@ -688,7 +688,8 @@ package_data = {
"model_executor/layers/fused_moe/configs/*.json",
"model_executor/layers/quantization/utils/configs/*.json",
"benchmarks/*.py",
"model_executor/layers/quantization/configs/w8a8/*.json"
"model_executor/layers/quantization/configs/w8a8/*.json",
"model_executor/layers/quantization/configs/awq/*.json"
]
}
......
{
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 4,
"num_ldmatrixes": 0
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 0
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 0
},
"8": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 2,
"num_ldmatrixes": 0
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 0
},
"24": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 0
},
"32": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 0
},
"48": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 0
},
"64": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 0
},
"96": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 2,
"num_stages": 2,
"num_ldmatrixes": 0
},
"128": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 0
},
"256": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 0
},
"512": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 0
},
"1024": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 0
},
"1536": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 0
},
"2048": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 0
},
"3072": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 0
},
"4096": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 2,
"num_ldmatrixes": 0
}
}
{
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 4,
"num_ldmatrixes": 0
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 0
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 0
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 0
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 0
},
"24": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 0
},
"32": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 0
},
"48": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 0
},
"64": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 0
},
"96": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 0
},
"128": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 0
},
"256": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 0
},
"512": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 0
},
"1024": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 0
},
"1536": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 2,
"num_ldmatrixes": 0
},
"2048": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 0
},
"3072": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"num_warps": 8,
"num_stages": 2,
"num_ldmatrixes": 0
},
"4096": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"num_warps": 8,
"num_stages": 2,
"num_ldmatrixes": 0
}
}
......@@ -1066,7 +1066,7 @@ def get_config_dtype_str(dtype: torch.dtype,
elif use_int8_w8a16:
return "int8_w8a16"
elif use_int4_w4a16:
return "int4_w8a16"
return "int4_w4a16"
elif dtype == torch.float:
# avoiding cases where kernel fails when float32 MoE
# use fp16/bfloat16 configs
......
......@@ -5,7 +5,10 @@ from typing import Any, Dict, List, Optional
import torch
import os
import torch.nn.functional as F
import vllm.envs as envs
import json
import math
from vllm.platforms import current_platform
from vllm import _custom_ops as ops
from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
UnquantizedLinearMethod)
......@@ -13,6 +16,57 @@ from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig)
from vllm.model_executor.parameter import (GroupQuantScaleParameter,
PackedvLLMParameter)
from vllm.model_executor.layers.quantization.awq_triton import awq_gemm_triton
from vllm.logger import init_logger
logger = init_logger(__name__)
triton_configs_dict={}
def get_triton_cache(file_path):
    """Load tuned AWQ Triton kernel configs from a JSON file into the cache.

    The JSON document is a two-level mapping: an outer key (for example
    ``"1536_7168"``) maps to inner keys (for example ``"1"``), each of which
    maps to one set of kernel parameters.  Every entry is stored in the
    module-level ``triton_configs_dict`` under ``"<inner>_<outer>"``, which
    matches the ``"M_N_K"`` key format that ``getspec_config`` looks up.

    Nothing is returned; the function only populates ``triton_configs_dict``.
    A missing file is ignored so callers fall back to default parameters.

    Args:
        file_path: Path of the JSON config file to read.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
        KeyError: If an entry lacks one of the required parameters.
    """
    # Open directly instead of checking os.path.exists() first: this avoids
    # the check-then-open race.  JSON text is read as UTF-8 so that decoding
    # does not depend on the process locale.
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            cachedata = json.load(file)
    except FileNotFoundError:
        return

    # Parameters every entry must provide, in the order they are stored.
    required_params = (
        'SPLIT_K',
        'BLOCK_SIZE_M',
        'BLOCK_SIZE_N',
        'BLOCK_SIZE_K',
        'GROUP_SIZE_M',
        'num_stages',
        'num_warps',
    )

    # Flatten the nested mapping into "<inner>_<outer>" -> config entries.
    for key, value in cachedata.items():
        for sub_key, sub_value in value.items():
            configs_value = {
                name: int(sub_value[name]) for name in required_params
            }
            # num_ldmatrixes is optional and only forwarded when present.
            if 'num_ldmatrixes' in sub_value:
                configs_value["num_ldmatrixes"] = int(
                    sub_value['num_ldmatrixes'])
            triton_configs_dict[f"{sub_key}_{key}"] = configs_value

    # Logged only after the file has actually been parsed and cached.
    logger.info("%s have loaded!", file_path)
def default_execution(k,n):
    """Load the AWQ Triton config file for one (k, n) shape if needed.

    The file is looked up as ``configs/awq/AWQ_<n>_<k>_<device>.json`` next
    to this module, where ``<device>`` is the current platform's device name
    with spaces replaced by underscores.  If the file is absent, nothing
    happens.

    Args:
        k: Used as the ``<k>`` part of the cache key and file name.
        n: Used as the ``<n>`` part of the cache key and file name.
    """
    # A cached "1_<n>_<k>" entry means this shape has already been loaded.
    if f"1_{n}_{k}" in triton_configs_dict:
        return

    module_dir = os.path.dirname(os.path.abspath(__file__))
    device_tag = current_platform.get_device_name().replace(" ", "_")
    config_path = os.path.join(f"{module_dir}/configs/awq/",
                               f"AWQ_{n}_{k}_{device_tag}.json")

    # Only regular files with a .json suffix are handed to the loader.
    if not (os.path.isfile(config_path) and config_path.endswith(".json")):
        return
    get_triton_cache(config_path)
def getspec_config(M,N,K):
    """Return the cached Triton config stored under ``"M_N_K"``, or None.

    Args:
        M: First component of the cache key.
        N: Second component of the cache key.
        K: Third component of the cache key.

    Returns:
        The entry from ``triton_configs_dict`` for ``f"{M}_{N}_{K}"``, or
        ``None`` when no such entry has been loaded.
    """
    # dict.get yields None for a missing key, so one lookup is enough.
    return triton_configs_dict.get(f"{M}_{N}_{K}")
class AWQShareWorkSpace:
......@@ -111,7 +165,6 @@ class AWQLinearMethod(LinearMethodBase):
self.quant_config = quant_config
self.awqsingleton= AWQShareWorkSpace()
self.use_awq_pad = os.environ.get('AWQ_PAD') == '1'
self.AWQ_CK_GEMMBS =int(os.getenv('AWQ_CK_GEMMBS', '20000'))
def create_weights(self, layer: torch.nn.Module,
input_size_per_partition: int,
......@@ -178,7 +231,9 @@ class AWQLinearMethod(LinearMethodBase):
layer.register_parameter("qzeros", qzeros)
layer.register_parameter("scales", scales)
layer.register_parameter("zeros_and_scales", zeros_and_scales)
# Load the Triton config
if envs.VLLM_USE_TRITON_AWQ:
default_execution(input_size_per_partition,output_size_per_partition)
def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
layer.qweight = torch.nn.Parameter(layer.qweight.data,
......@@ -196,7 +251,9 @@ class AWQLinearMethod(LinearMethodBase):
bias: Optional[torch.Tensor] = None) -> torch.Tensor:
qweight = layer.qweight
zeros_and_scales = layer.zeros_and_scales
qzeros = layer.qzeros
scales = layer.scales
pack_factor = self.quant_config.pack_factor
out_shape = (x.shape[:-1] + (qweight.shape[0] * 1, ))
reshaped_x = x.reshape(-1, x.shape[-1])
......@@ -211,8 +268,15 @@ class AWQLinearMethod(LinearMethodBase):
padding_group=0
else:
padding_group=0
if m <= self.AWQ_CK_GEMMBS:
if envs.VLLM_USE_TRITON_AWQ:
if m>16:
m = 2 ** math.ceil(math.log2(m))
best_config=getspec_config(m,n,k)
out = awq_gemm_triton(reshaped_x, qweight, scales, qzeros, pack_factor, best_config)
out_shape = (x.shape[:-1] + (qweight.shape[1] * 8, ))
else:
out = ops.awq_gemm(reshaped_x,
qweight,
zeros_and_scales,
......@@ -223,15 +287,6 @@ class AWQLinearMethod(LinearMethodBase):
padding_group,
self.awqsingleton.awqworkshapce,
self.awqsingleton.awqworkshapcesize)
else:
#下面是采用rocblas的做法
deqweight=ops.dequant_w4_gemm_colmajor( # shape[n, k/8] ---> [n,k]
qweight,
zeros_and_scales,
k+padding_group*self.quant_config.group_size,
n,
self.quant_config.group_size)
out=F.linear(reshaped_x, deqweight[:,0:k])
if bias is not None:
out.add_(bias)
......
......@@ -44,10 +44,14 @@ def awq_dequantize_kernel(
result_masks = result_masks_y[:, None] & result_masks_x[None, :]
# Load the weights.
iweights = tl.load(qweight_ptr + offsets, masks, 0.0)
iweights = tl.interleave(iweights, iweights)
iweights = tl.interleave(iweights, iweights)
iweights = tl.interleave(iweights, iweights)
iweights = tl.load(qweight_ptr + offsets, masks)
iweights =tl.join(iweights, iweights).reshape(iweights.shape[:-1] + [2 * iweights.shape[-1]])
iweights =tl.join(iweights, iweights).reshape(iweights.shape[:-1] + [2 * iweights.shape[-1]])
iweights =tl.join(iweights, iweights).reshape(iweights.shape[:-1] + [2 * iweights.shape[-1]])
# iweights = tl.interleave(iweights, iweights)
# iweights = tl.interleave(iweights, iweights)
# iweights = tl.interleave(iweights, iweights)
# Create reverse AWQ order as tensor: [0, 4, 1, 5, 2, 6, 3, 7]
# that will map given indices to the correct order.
......@@ -73,10 +77,14 @@ def awq_dequantize_kernel(
zero_masks = zero_masks_y[:, None] & zero_masks_x[None, :]
# Load the zeros.
zeros = tl.load(zeros_ptr + zero_offsets, zero_masks, 0.0)
zeros = tl.interleave(zeros, zeros)
zeros = tl.interleave(zeros, zeros)
zeros = tl.interleave(zeros, zeros)
zeros = tl.load(zeros_ptr + zero_offsets, zero_masks)
# zeros = tl.interleave(zeros, zeros)
# zeros = tl.interleave(zeros, zeros)
# zeros = tl.interleave(zeros, zeros)
zeros =tl.join(zeros, zeros).reshape(zeros.shape[:-1] + [2 * zeros.shape[-1]])
zeros =tl.join(zeros, zeros).reshape(zeros.shape[:-1] + [2 * zeros.shape[-1]])
zeros =tl.join(zeros, zeros).reshape(zeros.shape[:-1] + [2 * zeros.shape[-1]])
zeros = tl.broadcast_to(zeros, (BLOCK_SIZE_Y, BLOCK_SIZE_X * 8))
# Unpack and reorder: shift out the correct 4-bit value and mask.
......@@ -93,7 +101,7 @@ def awq_dequantize_kernel(
scale_masks = scale_masks_y[:, None] & scale_masks_x[None, :]
# Load the scales.
scales = tl.load(scales_ptr + scale_offsets, scale_masks, 0.0)
scales = tl.load(scales_ptr + scale_offsets, scale_masks)
scales = tl.broadcast_to(scales, (BLOCK_SIZE_Y, BLOCK_SIZE_X * 8))
# Dequantize.
......@@ -108,19 +116,26 @@ def awq_dequantize_kernel(
def awq_gemm_kernel(a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, M, N, K,
group_size, BLOCK_SIZE_M: tl.constexpr,
BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
SPLIT_K: tl.constexpr):
GROUP_SIZE_M: tl.constexpr,SPLIT_K: tl.constexpr):
pid = tl.program_id(axis=0)
pid_z = tl.program_id(1)
# NOTE: This doesn't work in TRITON_INTERPRET=1 mode. Use below instead.
# num_pid_n = (N + BLOCK_SIZE_N - 1) // BLOCK_SIZE_N
num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
pid_m = pid // num_pid_n
pid_n = pid % num_pid_n
accumulator_dtype = c_ptr.type.element_ty
if GROUP_SIZE_M == 1:
pid_m = pid // num_pid_n
pid_n = pid % num_pid_n
else:
num_pid_in_group = GROUP_SIZE_M * num_pid_n
group_id = pid // num_pid_in_group
first_pid_m = group_id * GROUP_SIZE_M
group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
pid_m = first_pid_m + (pid % group_size_m)
pid_n = (pid % num_pid_in_group) // group_size_m
# accumulator_dtype = c_ptr.type.element_ty
BLOCK_SIZE_N_8 = BLOCK_SIZE_N // 8
N_8 = N // 8
# NOTE: This doesn't work in TRITON_INTERPRET=1 mode. Use below instead.
# accumulator = tl.arange(0, BLOCK_SIZE_N)
# accumulator = tl.broadcast_to(accumulator[None, :],
......@@ -128,16 +143,16 @@ def awq_gemm_kernel(a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, M, N, K,
# accumulator = accumulator & 0x0
# accumulator = accumulator.to(accumulator_dtype)
accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N),
dtype=accumulator_dtype)
dtype=tl.float32)
# Create reverse AWQ order as tensor: [0, 4, 1, 5, 2, 6, 3, 7]
# that will map given indices to the correct order.
reverse_awq_order_tensor = ((tl.arange(0, 2) * 4)[None, :] +
tl.arange(0, 4)[:, None]).reshape(8)
shifts = ((tl.arange(0, 2) * 16)[None, :] +
(tl.arange(0, 4) * 4)[:, None]).reshape(1,8)
# Create the necessary shifts to use to unpack.
shifts = reverse_awq_order_tensor * 4
shifts = tl.broadcast_to(shifts[None, :],
# shifts = reverse_awq_order_tensor * 4
shifts = tl.broadcast_to(shifts,
(BLOCK_SIZE_K * (BLOCK_SIZE_N // 8), 8))
shifts = tl.reshape(shifts, (BLOCK_SIZE_K, BLOCK_SIZE_N))
......@@ -145,18 +160,15 @@ def awq_gemm_kernel(a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, M, N, K,
offsets_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
masks_am = offsets_am < M
offsets_bn = pid_n * (BLOCK_SIZE_N // 8) + tl.arange(0, BLOCK_SIZE_N // 8)
masks_bn = offsets_bn < N // 8
offsets_zn = pid_n * (BLOCK_SIZE_N // 8) + tl.arange(0, BLOCK_SIZE_N // 8)
masks_zn = offsets_zn < N // 8
offsets_bzn = pid_n * (BLOCK_SIZE_N_8) + tl.arange(0, BLOCK_SIZE_N // 8)
masks_bzn = offsets_bzn < N_8
offsets_sn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
masks_sn = offsets_sn < N
offsets_k = pid_z * BLOCK_SIZE_K + tl.arange(0, BLOCK_SIZE_K)
offsets_a = K * offsets_am[:, None] + offsets_k[None, :]
offsets_b = (N // 8) * offsets_k[:, None] + offsets_bn[None, :]
offsets_b = (N_8) * offsets_k[:, None] + offsets_bzn[None, :]
a_ptrs = a_ptr + offsets_a
b_ptrs = b_ptr + offsets_b
......@@ -167,33 +179,40 @@ def awq_gemm_kernel(a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, M, N, K,
for k in range(0, tl.cdiv(K, BLOCK_SIZE_K * SPLIT_K)):
masks_k = offsets_k < K
masks_a = masks_am[:, None] & masks_k[None, :]
a = tl.load(a_ptrs, mask=masks_a, other=0.0)
a = tl.load(a_ptrs, mask=masks_a)
masks_b = masks_k[:, None] & masks_bn[None, :]
b = tl.load(b_ptrs, mask=masks_b, other=0.0)
b = tl.interleave(b, b)
b = tl.interleave(b, b)
b = tl.interleave(b, b)
masks_b = masks_k[:, None] & masks_bzn[None, :]
b = tl.load(b_ptrs, mask=masks_b)
# b = tl.interleave(b, b)
# b = tl.interleave(b, b)
# b = tl.interleave(b, b)
b =tl.join(b, b).reshape(b.shape[:-1] + [2 * b.shape[-1]])
b =tl.join(b, b).reshape(b.shape[:-1] + [2 * b.shape[-1]])
b =tl.join(b, b).reshape(b.shape[:-1] + [2 * b.shape[-1]])
# Dequantize b.
offsets_szk = (
(BLOCK_SIZE_K * SPLIT_K * k + pid_z * BLOCK_SIZE_K) // group_size +
tl.arange(0, 1))
offsets_z = (N // 8) * offsets_szk[:, None] + offsets_zn[None, :]
offsets_szk = (BLOCK_SIZE_K * SPLIT_K * k + pid_z * BLOCK_SIZE_K) // group_size
offsets_szk = offsets_szk + (tl.arange(0,BLOCK_SIZE_K) // group_size)
offsets_z = (N_8) * offsets_szk[:, None] + offsets_bzn[None, :]
masks_zk = offsets_szk < K // group_size
masks_z = masks_zk[:, None] & masks_zn[None, :]
masks_z = masks_zk[:, None] & masks_bzn[None, :]
zeros_ptrs = zeros_ptr + offsets_z
zeros = tl.load(zeros_ptrs, mask=masks_z, other=0.0)
zeros = tl.interleave(zeros, zeros)
zeros = tl.interleave(zeros, zeros)
zeros = tl.interleave(zeros, zeros)
zeros = tl.load(zeros_ptrs, mask=masks_z)
# zeros = tl.interleave(zeros, zeros)
# zeros = tl.interleave(zeros, zeros)
# zeros = tl.interleave(zeros, zeros)
zeros =tl.join(zeros, zeros).reshape(zeros.shape[:-1] + [2 * zeros.shape[-1]])
zeros =tl.join(zeros, zeros).reshape(zeros.shape[:-1] + [2 * zeros.shape[-1]])
zeros =tl.join(zeros, zeros).reshape(zeros.shape[:-1] + [2 * zeros.shape[-1]])
zeros = tl.broadcast_to(zeros, (BLOCK_SIZE_K, BLOCK_SIZE_N))
offsets_s = N * offsets_szk[:, None] + offsets_sn[None, :]
masks_sk = offsets_szk < K // group_size
masks_s = masks_sk[:, None] & masks_sn[None, :]
scales_ptrs = scales_ptr + offsets_s
scales = tl.load(scales_ptrs, mask=masks_s, other=0.0)
scales = tl.load(scales_ptrs, mask=masks_s)
scales = tl.broadcast_to(scales, (BLOCK_SIZE_K, BLOCK_SIZE_N))
b = (b >> shifts) & 0xF
......@@ -202,18 +221,20 @@ def awq_gemm_kernel(a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, M, N, K,
b = b.to(c_ptr.type.element_ty)
# Accumulate results.
accumulator = tl.dot(a, b, accumulator, out_dtype=accumulator_dtype)
accumulator = tl.dot(a, b, accumulator, out_dtype=tl.float32)
offsets_k += BLOCK_SIZE_K * SPLIT_K
a_ptrs += BLOCK_SIZE_K * SPLIT_K
b_ptrs += BLOCK_SIZE_K * SPLIT_K * (N // 8)
b_ptrs += BLOCK_SIZE_K * SPLIT_K * (N_8)
c = accumulator.to(c_ptr.type.element_ty)
offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
c_ptrs = c_ptr + pid_z * N * M + N * offs_cm[:, None] + offs_cn[None, :]
c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)
tl.store(c_ptrs, c, mask=c_mask)
c_ptrs = c_ptr + N * offsets_am[:, None] + offsets_sn[None, :]
c_mask = masks_am[:, None] & masks_sn[None, :]
if SPLIT_K == 1:
tl.store(c_ptrs, c, mask=c_mask)
# tl.store(c_ptrs, c)
else:
tl.atomic_add(c_ptrs, c, mask=c_mask)
# qweights - [K , M // 8], int32
......@@ -272,9 +293,7 @@ def awq_gemm_triton(input: torch.Tensor,
scales: torch.Tensor,
qzeros: torch.Tensor,
split_k_iters: int,
block_size_m: int = 32,
block_size_n: int = 32,
block_size_k: int = 32) -> torch.Tensor:
config) -> torch.Tensor:
M, K = input.shape
N = qweight.shape[1] * 8
group_size = qweight.shape[0] // qzeros.shape[0]
......@@ -289,31 +308,27 @@ def awq_gemm_triton(input: torch.Tensor,
assert group_size in AWQ_TRITON_SUPPORTED_GROUP_SIZES or group_size == K
grid = lambda META: (
triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(
N, META['BLOCK_SIZE_N']),
split_k_iters,
triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(N, META['BLOCK_SIZE_N']),
META['SPLIT_K'],
)
result = torch.zeros((split_k_iters, M, N),
dtype=scales.dtype,
device=input.device)
if config is None:
config= {'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8,'SPLIT_K': 8}
if M >256:
#print("INFO:this size not found in json.")
config= {'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8,'SPLIT_K': 1}
result = torch.zeros((M, N), dtype=scales.dtype, device=input.device)
# A = input, B = qweight, C = result
# A = M x K, B = K x N, C = M x N
awq_gemm_kernel[grid](input,
qweight,
result,
qzeros,
scales,
M,
N,
K,
group_size,
BLOCK_SIZE_M=block_size_m,
BLOCK_SIZE_N=block_size_n,
BLOCK_SIZE_K=block_size_k,
SPLIT_K=split_k_iters)
result = result.sum(0)
qweight,
result,
qzeros,
scales,
M,
N,
K,
group_size,
**config)
return result
{
"1536_7168": {
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"3": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"5": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"6": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"7": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"9": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"10": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"11": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"12": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"13": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"14": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"15": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"32": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"64": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"128": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"256": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
},
"512": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
},
"1024": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 2,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
},
"2048": {
"BLOCK_SIZE_M": 256,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 8,
"SPLIT_K": 2,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
},
"4096": {
"BLOCK_SIZE_M": 256,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
}
}
}
\ No newline at end of file
{
"1536_7168": {
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"3": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"5": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"6": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"7": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"9": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"10": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"11": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"12": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"13": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"14": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"15": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"32": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 0
},
"64": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 0
},
"128": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"256": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"512": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
},
"1024": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 2,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
},
"2048": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"4096": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
}
}
}
\ No newline at end of file
{
"3072_1536": {
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"3": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"5": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"6": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"7": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"9": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"10": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"11": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"12": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"13": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"14": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"15": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"32": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"64": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 4,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"128": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 2,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"256": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"512": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
},
"1024": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"2048": {
"BLOCK_SIZE_M": 256,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
},
"4096": {
"BLOCK_SIZE_M": 256,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
}
}
}
\ No newline at end of file
{
"3072_1536": {
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"3": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"5": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"6": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"7": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"9": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"10": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"11": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"12": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"13": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"14": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"15": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"32": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"64": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 0
},
"128": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"256": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"512": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"1024": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
},
"2048": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"4096": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
}
}
}
\ No newline at end of file
{
"4096_512": {
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"3": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"5": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"6": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"7": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"9": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"10": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"11": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"12": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"13": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"14": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"15": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"32": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"64": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"128": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"256": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"512": {
"BLOCK_SIZE_M": 256,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
},
"1024": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"2048": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"4096": {
"BLOCK_SIZE_M": 256,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
}
}
}
\ No newline at end of file
{
"4096_512": {
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"3": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"5": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"6": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"7": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"9": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"10": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"11": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"12": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"13": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"14": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"15": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"32": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 2,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 0
},
"64": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 0
},
"128": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"256": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"512": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"1024": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"2048": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"4096": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
}
}
}
\ No newline at end of file
{
"4608_7168": {
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 2,
"num_ldmatrixes": 1
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 2,
"num_ldmatrixes": 1
},
"3": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 2,
"num_ldmatrixes": 1
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 2,
"num_ldmatrixes": 1
},
"5": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 2,
"num_ldmatrixes": 1
},
"6": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 2,
"num_ldmatrixes": 1
},
"7": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 2,
"num_ldmatrixes": 1
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 2,
"num_ldmatrixes": 1
},
"9": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 2,
"num_ldmatrixes": 1
},
"10": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 2,
"num_ldmatrixes": 1
},
"11": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 2,
"num_ldmatrixes": 1
},
"12": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 2,
"num_ldmatrixes": 1
},
"13": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 2,
"num_ldmatrixes": 1
},
"14": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 2,
"num_ldmatrixes": 1
},
"15": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 2,
"num_ldmatrixes": 1
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 2,
"num_ldmatrixes": 1
},
"32": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"64": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
},
"128": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
},
"256": {
"BLOCK_SIZE_M": 256,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 2,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
},
"512": {
"BLOCK_SIZE_M": 256,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
},
"1024": {
"BLOCK_SIZE_M": 256,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 8,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
},
"2048": {
"BLOCK_SIZE_M": 256,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
},
"4096": {
"BLOCK_SIZE_M": 256,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 8,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
}
}
}
\ No newline at end of file
{
"4608_7168": {
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"3": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"5": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"6": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"7": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"9": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"10": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"11": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"12": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"13": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"14": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"15": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"32": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"64": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 4,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"128": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
},
"256": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"512": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 2,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"1024": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"2048": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
},
"4096": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 8,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
}
}
}
\ No newline at end of file
{
"512_7168": {
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"3": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"5": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"6": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"7": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"9": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"10": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"11": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"12": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"13": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"14": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"15": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"32": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"64": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"128": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"256": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"512": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 4,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"1024": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
},
"2048": {
"BLOCK_SIZE_M": 256,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 2,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
},
"4096": {
"BLOCK_SIZE_M": 256,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
}
}
}
\ No newline at end of file
{
"512_7168": {
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"3": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"5": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"6": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"7": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"9": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"10": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"11": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"12": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"13": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"14": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"15": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"32": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"64": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"128": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 0
},
"256": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"512": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"1024": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"2048": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 2,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"4096": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 2,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
}
}
}
\ No newline at end of file
{
"576_7168": {
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"3": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"5": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"6": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"7": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"9": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"10": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"11": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"12": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"13": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"14": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"15": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"32": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"64": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"128": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"256": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"512": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 4,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"1024": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
},
"2048": {
"BLOCK_SIZE_M": 256,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 2,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
},
"4096": {
"BLOCK_SIZE_M": 256,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
}
}
}
\ No newline at end of file
{
"576_7168": {
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"3": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"5": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"6": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"7": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"9": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"10": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"11": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"12": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"13": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"14": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"15": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"32": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"64": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"128": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"256": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"512": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"1024": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
},
"2048": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 8,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"4096": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 2,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
}
}
}
\ No newline at end of file
{
"7168_2048": {
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"3": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"5": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"6": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"7": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"9": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"10": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"11": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"12": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"13": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"14": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"15": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"32": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"64": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 2,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"128": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"256": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
},
"512": {
"BLOCK_SIZE_M": 256,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"1024": {
"BLOCK_SIZE_M": 256,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 8,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
},
"2048": {
"BLOCK_SIZE_M": 256,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
},
"4096": {
"BLOCK_SIZE_M": 256,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
}
}
}
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment