Commit 47bd229c authored by yangql's avatar yangql
Browse files

适配deepseekv3\v2 moe awq的推理支持

parent 4a734b9d
# SPDX-License-Identifier: Apache-2.0
import argparse
import time
from datetime import datetime
from itertools import product
from typing import Any, Dict, List, Tuple, TypedDict
import ray
import torch
import triton
from ray.experimental.tqdm_ray import tqdm
from transformers import AutoConfig
from vllm.model_executor.layers.fused_moe.fused_moe import *
from vllm.platforms import current_platform
from vllm.utils import FlexibleArgumentParser
FP8_DTYPE = torch.float8_e4m3fnuz if current_platform.is_rocm(
) else torch.float8_e4m3fn
class BenchmarkConfig(TypedDict):
BLOCK_SIZE_M: int
BLOCK_SIZE_N: int
BLOCK_SIZE_K: int
GROUP_SIZE_M: int
num_warps: int
num_stages: int
num_ldmatrixes: Optional[int]
def benchmark_config(
config: BenchmarkConfig,
num_tokens: int,
num_experts: int,
shard_intermediate_size: int,
hidden_size: int,
topk: int,
dtype: torch.dtype,
use_fp8_w8a8: bool,
use_int8_w8a16: bool,
use_int4_w4a16: bool,
group_size: int,
num_iters: int = 100,
nn_moe: Optional[bool] = False
) -> float:
init_dtype = torch.float16 if use_fp8_w8a8 else dtype
x = torch.randn(num_tokens, hidden_size, dtype=dtype)
if use_int8_w8a16:
if not nn_moe:
w1 = torch.randint(-127,
127, (
num_experts,
shard_intermediate_size,
hidden_size,
),
dtype=torch.int8)
w2 = torch.randint(-127,
127, (
num_experts,
hidden_size,
shard_intermediate_size // 2,
),
dtype=torch.int8)
else:
w1 = torch.randint(-127,
127, (
num_experts,
hidden_size,
shard_intermediate_size
),
dtype=torch.int8)
w2 = torch.randint(-127,
127, (
num_experts,
shard_intermediate_size // 2,
hidden_size
),
dtype=torch.int8)
if use_int4_w4a16:
w1 = torch.randint(0,
255, (
num_experts,
shard_intermediate_size,
hidden_size // 2,
),
dtype=torch.uint8)
w2 = torch.randint(0,
255, (
num_experts,
hidden_size,
shard_intermediate_size // 4,
),
dtype=torch.uint8)
else:
if not nn_moe:
w1 = torch.randn(num_experts,
shard_intermediate_size,
hidden_size,
dtype=init_dtype)
w2 = torch.randn(num_experts,
hidden_size,
shard_intermediate_size // 2,
dtype=init_dtype)
else:
w1 = torch.randn(num_experts,
hidden_size,
shard_intermediate_size,
dtype=init_dtype)
w2 = torch.randn(num_experts,
shard_intermediate_size // 2,
hidden_size,
dtype=init_dtype)
gating_output = torch.randn(num_iters,
num_tokens,
num_experts,
dtype=torch.float32)
w1_scale = None
w2_scale = None
a1_scale = None
a2_scale = None
w1_zp = None
w2_zp = None
block_shape = None
if use_int8_w8a16:
w1_scale = torch.randn((num_experts, 2 * shard_intermediate_size),
dtype=torch.float32)
w2_scale = torch.randn((hidden_size, num_experts), dtype=torch.float32)
if use_fp8_w8a8:
w1_scale = torch.randn(num_experts, dtype=torch.float32)
w2_scale = torch.randn(num_experts, dtype=torch.float32)
a1_scale = torch.randn(1, dtype=torch.float32)
a2_scale = torch.randn(1, dtype=torch.float32)
w1 = w1.to(FP8_DTYPE)
w2 = w2.to(FP8_DTYPE)
if use_int4_w4a16:
w1_scale = torch.randn((num_experts, shard_intermediate_size, hidden_size // (group_size)),
dtype=torch.float16)
w2_scale = torch.randn((num_experts, hidden_size,shard_intermediate_size // (2*group_size)),
dtype=torch.float16)
w1_zp = torch.randint(0,
255, (
num_experts,
shard_intermediate_size // 2,
hidden_size // (group_size),
),
dtype=torch.uint8)
w2_zp = torch.randint(0,
255, (
num_experts,
hidden_size // 2,
shard_intermediate_size // (2*group_size),
),
dtype=torch.uint8)
nn_moe = False
block_shape=[0, group_size]
input_gating = torch.randn(num_tokens, num_experts, dtype=torch.float32)
print(f"shape: {x.shape[0]} | config: {config}")
def prepare(i: int):
input_gating.copy_(gating_output[i])
def run():
from vllm.model_executor.layers.fused_moe import override_config
with override_config(config):
fused_moe(
x,
w1,
w2,
input_gating,
topk,
renormalize=True,
inplace=True,
use_fp8_w8a8=use_fp8_w8a8,
use_int8_w8a16=use_int8_w8a16,
use_int4_w4a16=use_int4_w4a16,
w1_scale=w1_scale,
w2_scale=w2_scale,
w1_zp=w1_zp,
w2_zp=w2_zp,
a1_scale=a1_scale,
a2_scale=a2_scale,
use_nn_moe=nn_moe,
block_shape=block_shape,
)
# JIT compilation & warmup
run()
torch.cuda.synchronize()
# Capture 10 invocations with CUDA graph
# graph = torch.cuda.CUDAGraph()
# with torch.cuda.graph(graph):
# for _ in range(10):
# run()
# torch.cuda.synchronize()
# Warmup
for _ in range(5):
# graph.replay()
run()
torch.cuda.synchronize()
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
latencies: List[float] = []
for i in range(num_iters):
prepare(i)
torch.cuda.synchronize()
start_event.record()
# graph.replay()
run()
end_event.record()
end_event.synchronize()
latencies.append(start_event.elapsed_time(end_event))
avg = sum(latencies) / (num_iters * 10) * 1000 # us
print(f"avg: {avg}")
# graph.reset()
return avg
def get_rocm_tuning_space(use_fp16, use_int4_w4a16, nn_moe: Optional[bool] = False):
if use_int4_w4a16:
block_m_range = [16, 32, 64]
block_n_range = [32, 64, 128]
block_k_range = [16, 32, 64]
num_warps_range = [1, 2, 4, 8]
group_m_range = [1, 4, 8, 16]
num_stage_range = [2, 4]
num_ldmatrixes = [0]
param_ranges = {
"BLOCK_SIZE_M": block_m_range,
"BLOCK_SIZE_N": block_n_range,
"BLOCK_SIZE_K": block_k_range,
"GROUP_SIZE_M": group_m_range,
"num_warps": num_warps_range,
"num_stages": num_stage_range,
"num_ldmatrixes": num_ldmatrixes,
}
return param_ranges
block_mn_range = [16, 32, 64, 128, 256]
block_k_range = [16, 32, 64, 128, 256]
if not use_fp16:
block_k_range.remove(16) # BLOCK_K=16 not supported for fp8
num_warps_range = [1, 2, 4, 8]
group_m_range = [1, 4, 8, 16, 32]
num_stage_range = [2]
waves_per_eu_range = [0]
matrix_instr_nonkdim_range = [16, 32] if use_fp16 else []
kpack_range = [1, 2] if use_fp16 else []
param_ranges = {
"BLOCK_SIZE_M": block_mn_range,
"BLOCK_SIZE_N": block_mn_range,
"BLOCK_SIZE_K": block_k_range,
"GROUP_SIZE_M": group_m_range,
"num_warps": num_warps_range,
"num_stages": num_stage_range,
"waves_per_eu": waves_per_eu_range,
}
if nn_moe:
param_ranges["num_ldmatrixes"] = 1
if use_fp16:
param_ranges["matrix_instr_nonkdim"] = matrix_instr_nonkdim_range
param_ranges["kpack"] = kpack_range
return param_ranges
def get_configs_compute_bound(use_fp16, use_int4_w4a16, nn_moe: Optional[bool] = False) -> List[Dict[str, int]]:
configs: List[BenchmarkConfig] = []
if current_platform.is_rocm():
param_ranges = get_rocm_tuning_space(use_fp16, use_int4_w4a16, nn_moe)
else:
# Reduced search space for faster tuning.
# TODO(woosuk): Increase the search space and use a performance model to
# prune the search space.
block_m_range = [16, 32, 64, 128, 256]
block_n_range = [32, 64, 128, 256]
block_k_range = [64, 128, 256]
num_warps_range = [4, 8]
group_m_range = [1, 16, 32, 64]
num_stage_range = [2, 3, 4, 5]
param_ranges = {
"BLOCK_SIZE_M": block_m_range,
"BLOCK_SIZE_N": block_n_range,
"BLOCK_SIZE_K": block_k_range,
"GROUP_SIZE_M": group_m_range,
"num_warps": num_warps_range,
"num_stages": num_stage_range,
}
keys, values = zip(*param_ranges.items())
for config_values in product(*values):
config = dict(zip(keys, config_values))
configs.append(config)
return configs
def prune_rocm_search_space(num_tokens, shard_intermediate_size, hidden_size,
search_space, is_fp16):
N1, K1 = shard_intermediate_size, hidden_size
N2, K2 = hidden_size, shard_intermediate_size // 2
pruned_space_1 = prune_rocm_configs(num_tokens * 2, N1, K1, search_space,
is_fp16)
pruned_space_2 = prune_rocm_configs(num_tokens * 2, N2, K2, search_space,
is_fp16)
search_space = merge_unique_dicts(pruned_space_1, pruned_space_2)
return search_space
# The following code is inspired by ROCm/Triton GEMM tuning script:
# https://github.com/ROCm/triton/blob/triton-mlir/scripts/amd/gemm/tune_gemm.py#L89
def prune_rocm_configs(M, N, K, configs, is_fp16=True):
pruned_configs = []
elemBytes_a = 2 if is_fp16 else 1
elemBytes_b = 2 if is_fp16 else 1
mfma = 16 if M < 32 or N < 32 else 32
# TODO (zhanglx): figure out the boundary between large and small gemms
large_gemm = False
if M >= 2048 and N >= 2048:
large_gemm = True
for config in configs:
BLOCK_SIZE_M = config.get("BLOCK_SIZE_M")
BLOCK_SIZE_N = config.get("BLOCK_SIZE_N")
BLOCK_SIZE_K = config.get("BLOCK_SIZE_K")
num_warps = config.get("num_warps")
if is_fp16:
matrix_instr_nonkdim = config.get("matrix_instr_nonkdim")
if matrix_instr_nonkdim > mfma:
continue
if mfma == 4 and BLOCK_SIZE_K < 64:
continue
# some layouts could not work properly in case
# number elements per thread is less 1
if BLOCK_SIZE_M * BLOCK_SIZE_N < 64:
continue
SPLIT_K = config.get("SPLIT_K", 1)
GROUP_M = config.get("GROUP_SIZE_M")
if is_fp16:
if (matrix_instr_nonkdim > BLOCK_SIZE_M
or matrix_instr_nonkdim > BLOCK_SIZE_N):
continue
if (matrix_instr_nonkdim >= M
and matrix_instr_nonkdim != BLOCK_SIZE_M):
continue
if (matrix_instr_nonkdim >= N
and matrix_instr_nonkdim != BLOCK_SIZE_N):
continue
# Skip BLOCK_SIZE that is too large compare to M/N
# unless BLOCK_SIZE is already small enough
if M * 2 < BLOCK_SIZE_M and BLOCK_SIZE_M != 16:
continue
if N * 2 < BLOCK_SIZE_N and BLOCK_SIZE_N != 16:
continue
# skip large split_k when not necessary
if SPLIT_K != 1 and not need_split_k(M, N, K):
continue
# skip split_k that leads to EVEN_K = false
leap = SPLIT_K * BLOCK_SIZE_K
modv = K % leap
if modv != 0:
continue
# skip large GROUP_M
if GROUP_M * BLOCK_SIZE_M > M and GROUP_M != 1:
continue
# out of shared memory resource
# TODO (zhanglx): This does not consider the LDS usage in the epilogue
LDS = (BLOCK_SIZE_K * BLOCK_SIZE_M * elemBytes_a +
BLOCK_SIZE_K * BLOCK_SIZE_N * elemBytes_b)
if LDS > 65536:
continue
# Skip small block sizes and num_warps for large gemm
# For fp16 and f8, we want to only use BLOCK_SIZE >= 64
if large_gemm:
if BLOCK_SIZE_M < 64 or BLOCK_SIZE_N < 64:
continue
if BLOCK_SIZE_K < 64:
continue
if num_warps < 4:
continue
pruned_configs.append(config)
return pruned_configs
def need_split_k(SIZE_M, SIZE_N, SIZE_K):
return (SIZE_M < 64 or SIZE_N < 64) and SIZE_K > 1024
def merge_unique_dicts(list1, list2):
result = []
combined_list = list1.copy()
combined_list.extend(list2)
for dictionary in combined_list:
if dictionary not in result:
result.append(dictionary)
return result
@ray.remote(num_gpus=1)
class BenchmarkWorker:
def __init__(self, seed: int) -> None:
torch.set_default_device("cuda")
current_platform.seed_everything(seed)
self.seed = seed
# Get the device ID to allocate tensors and kernels
# on the respective GPU. This is required for Ray to work
# correctly with multi-GPU tuning on the ROCm platform.
self.device_id = int(ray.get_gpu_ids()[0])
def benchmark(
self,
num_tokens: int,
num_experts: int,
shard_intermediate_size: int,
hidden_size: int,
topk: int,
dtype: torch.dtype,
use_fp8_w8a8: bool,
use_int8_w8a16: bool,
use_int4_w4a16: bool,
group_size:int,
) -> Tuple[Dict[str, int], float]:
current_platform.seed_everything(self.seed)
dtype_str = get_config_dtype_str(dtype,
use_int4_w4a16=use_int4_w4a16,
use_int8_w8a16=use_int8_w8a16,
use_fp8_w8a8=use_fp8_w8a8)
# NOTE(woosuk): The current naming convention uses w2.shape[2], which
# is the intermediate size after silu_and_mul.
config_shard_intermediate_size = shard_intermediate_size
if use_int4_w4a16:
config_shard_intermediate_size = shard_intermediate_size // 2
op_config = get_moe_configs(num_experts, config_shard_intermediate_size // 2,
dtype_str)
if op_config is None:
config = get_default_config(num_tokens,
num_experts,
config_shard_intermediate_size,
hidden_size,
topk,
dtype_str,
is_marlin=False)
else:
config = op_config[min(op_config.keys(),
key=lambda x: abs(x - num_tokens))]
kernel_time = benchmark_config(config, num_tokens, num_experts,
shard_intermediate_size, hidden_size,
topk, dtype, use_fp8_w8a8,
use_int8_w8a16,use_int4_w4a16,group_size)
return config, kernel_time
def tune(
self,
num_tokens: int,
num_experts: int,
shard_intermediate_size: int,
hidden_size: int,
topk: int,
dtype: torch.dtype,
use_fp8_w8a8: bool,
use_int8_w8a16: bool,
use_int4_w4a16: bool,
group_size: int,
search_space: List[Dict[str, int]],
nn_moe: Optional[bool] = False
) -> Dict[str, int]:
best_config = None
best_time = float("inf")
if current_platform.is_rocm():
is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16 or use_int4_w4a16)
search_space = prune_rocm_search_space(num_tokens,
shard_intermediate_size,
hidden_size, search_space,
is_fp16)
with torch.cuda.device(self.device_id):
for config in tqdm(search_space):
try:
kernel_time = benchmark_config(config,
num_tokens,
num_experts,
shard_intermediate_size,
hidden_size,
topk,
dtype,
use_fp8_w8a8,
use_int8_w8a16,
use_int4_w4a16,
group_size,
num_iters=20,
nn_moe=nn_moe)
except triton.runtime.autotuner.OutOfResources:
# Some configurations may be invalid and fail to compile.
continue
if kernel_time < best_time:
best_time = kernel_time
best_config = config
now = datetime.now()
print(f"{now.ctime()}] Completed tuning for batch_size={num_tokens}")
assert best_config is not None
return best_config
def sort_config(config: BenchmarkConfig) -> BenchmarkConfig:
if "num_ldmatrixes" not in config:
return {
"BLOCK_SIZE_M":
config["BLOCK_SIZE_M"],
"BLOCK_SIZE_N":
config["BLOCK_SIZE_N"],
"BLOCK_SIZE_K":
config["BLOCK_SIZE_K"],
"GROUP_SIZE_M":
config["GROUP_SIZE_M"],
"num_warps":
config["num_warps"],
"num_stages":
config["num_stages"],
**({
"waves_per_eu": config["waves_per_eu"]
} if "waves_per_eu" in config else {}),
**({
"matrix_instr_nonkdim": config["matrix_instr_nonkdim"]
} if "matrix_instr_nonkdim" in config else {}),
**({
"kpack": config["kpack"]
} if "kpack" in config else {}),
}
else:
return {
"BLOCK_SIZE_M":
config["BLOCK_SIZE_M"],
"BLOCK_SIZE_N":
config["BLOCK_SIZE_N"],
"BLOCK_SIZE_K":
config["BLOCK_SIZE_K"],
"GROUP_SIZE_M":
config["GROUP_SIZE_M"],
"num_warps":
config["num_warps"],
"num_stages":
config["num_stages"],
"num_ldmatrixes":
config["num_ldmatrixes"],
**({
"waves_per_eu": config["waves_per_eu"]
} if "waves_per_eu" in config else {}),
**({
"matrix_instr_nonkdim": config["matrix_instr_nonkdim"]
} if "matrix_instr_nonkdim" in config else {}),
**({
"kpack": config["kpack"]
} if "kpack" in config else {}),
}
def save_configs(configs: Dict[int, BenchmarkConfig], num_experts: int,
shard_intermediate_size: int, hidden_size: int, topk: int,
dtype: torch.dtype, use_fp8_w8a8: bool,
use_int8_w8a16: bool, use_int4_w4a16: bool, use_nn_moe: Optional[bool] = False) -> None:
dtype_str = get_config_dtype_str(dtype,
use_int8_w8a16=use_int8_w8a16,
use_int4_w4a16=use_int4_w4a16,
use_fp8_w8a8=use_fp8_w8a8)
# NOTE(woosuk): The current naming convention uses w2.shape[2], which
# is the intermediate size after silu_and_mul.
filename = get_config_file_name(num_experts, shard_intermediate_size // 2,
dtype_str, use_nn_moe=use_nn_moe)
print(f"Writing best config to {filename}...")
with open(filename, "w") as f:
json.dump(configs, f, indent=4)
f.write("\n")
def main(args: argparse.Namespace):
print(args)
config = AutoConfig.from_pretrained(
args.model, trust_remote_code=args.trust_remote_code)
group_size = None
if config.architectures[0] == "DbrxForCausalLM":
E = config.ffn_config.moe_num_experts
topk = config.ffn_config.moe_top_k
intermediate_size = config.ffn_config.ffn_hidden_size
shard_intermediate_size = 2 * intermediate_size // args.tp_size
elif config.architectures[0] == "JambaForCausalLM":
E = config.num_experts
topk = config.num_experts_per_tok
intermediate_size = config.intermediate_size
shard_intermediate_size = 2 * intermediate_size // args.tp_size
elif config.architectures[0] == "DeepseekV2ForCausalLM" or "DeepseekV3ForCausalLM":
E = config.n_routed_experts
topk = config.num_experts_per_tok
intermediate_size = config.moe_intermediate_size
shard_intermediate_size = 2 * intermediate_size // args.tp_size
if config.quantization_config['quant_method'] == "awq":
group_size = config.quantization_config["group_size"]
else:
# Default: Mixtral.
E = config.num_local_experts
topk = config.num_experts_per_tok
intermediate_size = config.intermediate_size
shard_intermediate_size = 2 * intermediate_size // args.tp_size
hidden_size = config.hidden_size
dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
use_fp8_w8a8 = args.dtype == "fp8_w8a8"
use_int8_w8a16 = args.dtype == "int8_w8a16"
use_int4_w4a16 = args.dtype == "int4_w4a16"
if args.batch_size is None:
batch_sizes = [
1, 2, 4, 8, 16, 24, 32, 48, 64, 96, 128, 256, 512, 1024, 1536,
2048, 3072, 4096
]
else:
batch_sizes = [args.batch_size]
ray.init(address=None,
ignore_reinit_error=True,
num_gpus=1)
num_gpus = int(ray.available_resources()["GPU"])
workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)]
def _distribute(method: str, inputs: List[Any]) -> List[Any]:
outputs = []
worker_idx = 0
for input_args in inputs:
worker = workers[worker_idx]
worker_method = getattr(worker, method)
output = worker_method.remote(*input_args)
outputs.append(output)
worker_idx = (worker_idx + 1) % num_gpus
return ray.get(outputs)
if args.tune:
is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16 or use_int4_w4a16)
search_space = get_configs_compute_bound(is_fp16, use_int4_w4a16, args.nn_moe)
print(f"Start tuning over {len(search_space)} configurations...")
start = time.time()
configs = _distribute(
"tune", [(batch_size, E, shard_intermediate_size, hidden_size,
topk, dtype, use_fp8_w8a8, use_int8_w8a16, use_int4_w4a16, group_size, search_space, args.nn_moe)
for batch_size in batch_sizes])
best_configs = {
M: sort_config(config)
for M, config in zip(batch_sizes, configs)
}
if use_int4_w4a16:
save_configs(best_configs, E, shard_intermediate_size // 2, hidden_size,
topk, dtype, use_fp8_w8a8, use_int8_w8a16, use_int4_w4a16, use_nn_moe=args.nn_moe)
else:
save_configs(best_configs, E, shard_intermediate_size, hidden_size,
topk, dtype, use_fp8_w8a8, use_int8_w8a16, use_nn_moe=args.nn_moe)
end = time.time()
print(f"Tuning took {end - start:.2f} seconds")
else:
outputs = _distribute(
"benchmark", [(batch_size, E, shard_intermediate_size, hidden_size,
topk, dtype, use_fp8_w8a8, use_int8_w8a16, use_int4_w4a16, group_size)
for batch_size in batch_sizes])
for batch_size, (config, kernel_time) in zip(batch_sizes, outputs):
print(f"Batch size: {batch_size}, config: {config}")
print(f"Kernel time: {kernel_time:.2f} us")
if __name__ == "__main__":
parser = FlexibleArgumentParser()
parser.add_argument("--model",
type=str,
default="/home/yang/llm-models/vllm-awq-models/DeepSeek-R1-AWQ/")
parser.add_argument("--tp-size",
"-tp",
"--tensor-parallel-size",
type=int,
default=8)
parser.add_argument("--dtype",
type=str,
choices=["auto", "fp8_w8a8", "int8_w8a16", "int4_w4a16"],
default="int4_w4a16")
parser.add_argument("--seed", type=int, default=0)
parser.add_argument("--batch-size", type=int, required=False)
parser.add_argument("--tune", action="store_true",default=False)
parser.add_argument("--nn_moe", type=bool, default=False)
parser.add_argument("--trust-remote-code", action="store_true", default=True)
args = parser.parse_args()
main(args)
...@@ -688,7 +688,8 @@ package_data = { ...@@ -688,7 +688,8 @@ package_data = {
"model_executor/layers/fused_moe/configs/*.json", "model_executor/layers/fused_moe/configs/*.json",
"model_executor/layers/quantization/utils/configs/*.json", "model_executor/layers/quantization/utils/configs/*.json",
"benchmarks/*.py", "benchmarks/*.py",
"model_executor/layers/quantization/configs/w8a8/*.json" "model_executor/layers/quantization/configs/w8a8/*.json",
"model_executor/layers/quantization/configs/awq/*.json"
] ]
} }
......
{
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 4,
"num_ldmatrixes": 0
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 0
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 0
},
"8": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 2,
"num_ldmatrixes": 0
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 0
},
"24": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 0
},
"32": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 0
},
"48": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 0
},
"64": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 0
},
"96": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 2,
"num_stages": 2,
"num_ldmatrixes": 0
},
"128": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 0
},
"256": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 0
},
"512": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 0
},
"1024": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 0
},
"1536": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 0
},
"2048": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 0
},
"3072": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 0
},
"4096": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 2,
"num_ldmatrixes": 0
}
}
{
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 4,
"num_ldmatrixes": 0
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 0
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 0
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 0
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 0
},
"24": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 0
},
"32": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 0
},
"48": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 0
},
"64": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 0
},
"96": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 0
},
"128": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 0
},
"256": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 0
},
"512": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 0
},
"1024": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 0
},
"1536": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 2,
"num_ldmatrixes": 0
},
"2048": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 0
},
"3072": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"num_warps": 8,
"num_stages": 2,
"num_ldmatrixes": 0
},
"4096": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"num_warps": 8,
"num_stages": 2,
"num_ldmatrixes": 0
}
}
...@@ -1066,7 +1066,7 @@ def get_config_dtype_str(dtype: torch.dtype, ...@@ -1066,7 +1066,7 @@ def get_config_dtype_str(dtype: torch.dtype,
elif use_int8_w8a16: elif use_int8_w8a16:
return "int8_w8a16" return "int8_w8a16"
elif use_int4_w4a16: elif use_int4_w4a16:
return "int4_w8a16" return "int4_w4a16"
elif dtype == torch.float: elif dtype == torch.float:
# avoiding cases where kernel fails when float32 MoE # avoiding cases where kernel fails when float32 MoE
# use fp16/bfloat16 configs # use fp16/bfloat16 configs
......
...@@ -5,7 +5,10 @@ from typing import Any, Dict, List, Optional ...@@ -5,7 +5,10 @@ from typing import Any, Dict, List, Optional
import torch import torch
import os import os
import torch.nn.functional as F import torch.nn.functional as F
import vllm.envs as envs
import json
import math
from vllm.platforms import current_platform
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
UnquantizedLinearMethod) UnquantizedLinearMethod)
...@@ -13,6 +16,57 @@ from vllm.model_executor.layers.quantization.base_config import ( ...@@ -13,6 +16,57 @@ from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig) QuantizationConfig)
from vllm.model_executor.parameter import (GroupQuantScaleParameter, from vllm.model_executor.parameter import (GroupQuantScaleParameter,
PackedvLLMParameter) PackedvLLMParameter)
from vllm.model_executor.layers.quantization.awq_triton import awq_gemm_triton
from vllm.logger import init_logger
logger = init_logger(__name__)
triton_configs_dict={}
def get_triton_cache(file_path):
#会将所报错的json文件以字典的形式return出来
if os.path.exists(file_path):
with open(file_path, 'r') as file:
cachedata = json.load(file)
#把所有的cache解析成key:config的形式:[M_N_K]:[config]
for key, value in cachedata.items():
for sub_key, sub_value in value.items():
configs_key= f"{sub_key}_{key}"
configs_value={
'SPLIT_K': int(sub_value["SPLIT_K"]),
'BLOCK_SIZE_M': int(sub_value["BLOCK_SIZE_M"]),
'BLOCK_SIZE_N': int(sub_value["BLOCK_SIZE_N"]),
'BLOCK_SIZE_K': int(sub_value["BLOCK_SIZE_K"]),
'GROUP_SIZE_M': int(sub_value["GROUP_SIZE_M"]),
'num_stages':int(sub_value['num_stages']),
'num_warps':int(sub_value['num_warps'])
}
if 'num_ldmatrixes' in sub_value:
configs_value["num_ldmatrixes"] = int(sub_value['num_ldmatrixes'])
triton_configs_dict[configs_key]=configs_value
logger.info("%s have loaded!", file_path)
def default_execution(k,n):
configs_key= f"1_{n}_{k}"
if configs_key in triton_configs_dict:
return
script_dir = os.path.dirname(os.path.abspath(__file__))
cache_json_file=f"{script_dir}/configs/awq/"
device_name = current_platform.get_device_name().replace(" ", "_")
filename = f"AWQ_{n}_{k}_{device_name}.json"
file_full_path = os.path.join(cache_json_file, filename)
if os.path.isfile(file_full_path) and file_full_path.endswith(".json"):
# 如果是文件,则添加到列表
get_triton_cache(file_full_path)
return
def getspec_config(M,N,K):
if f"{M}_{N}_{K}" in triton_configs_dict:
return triton_configs_dict[f"{M}_{N}_{K}"]
else:
return None
class AWQShareWorkSpace: class AWQShareWorkSpace:
...@@ -111,7 +165,6 @@ class AWQLinearMethod(LinearMethodBase): ...@@ -111,7 +165,6 @@ class AWQLinearMethod(LinearMethodBase):
self.quant_config = quant_config self.quant_config = quant_config
self.awqsingleton= AWQShareWorkSpace() self.awqsingleton= AWQShareWorkSpace()
self.use_awq_pad = os.environ.get('AWQ_PAD') == '1' self.use_awq_pad = os.environ.get('AWQ_PAD') == '1'
self.AWQ_CK_GEMMBS =int(os.getenv('AWQ_CK_GEMMBS', '20000'))
def create_weights(self, layer: torch.nn.Module, def create_weights(self, layer: torch.nn.Module,
input_size_per_partition: int, input_size_per_partition: int,
...@@ -178,7 +231,9 @@ class AWQLinearMethod(LinearMethodBase): ...@@ -178,7 +231,9 @@ class AWQLinearMethod(LinearMethodBase):
layer.register_parameter("qzeros", qzeros) layer.register_parameter("qzeros", qzeros)
layer.register_parameter("scales", scales) layer.register_parameter("scales", scales)
layer.register_parameter("zeros_and_scales", zeros_and_scales) layer.register_parameter("zeros_and_scales", zeros_and_scales)
# 加载triton_config
if envs.VLLM_USE_TRITON_AWQ:
default_execution(input_size_per_partition,output_size_per_partition)
def process_weights_after_loading(self, layer: torch.nn.Module) -> None: def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
layer.qweight = torch.nn.Parameter(layer.qweight.data, layer.qweight = torch.nn.Parameter(layer.qweight.data,
...@@ -196,7 +251,9 @@ class AWQLinearMethod(LinearMethodBase): ...@@ -196,7 +251,9 @@ class AWQLinearMethod(LinearMethodBase):
bias: Optional[torch.Tensor] = None) -> torch.Tensor: bias: Optional[torch.Tensor] = None) -> torch.Tensor:
qweight = layer.qweight qweight = layer.qweight
zeros_and_scales = layer.zeros_and_scales zeros_and_scales = layer.zeros_and_scales
qzeros = layer.qzeros
scales = layer.scales
pack_factor = self.quant_config.pack_factor
out_shape = (x.shape[:-1] + (qweight.shape[0] * 1, )) out_shape = (x.shape[:-1] + (qweight.shape[0] * 1, ))
reshaped_x = x.reshape(-1, x.shape[-1]) reshaped_x = x.reshape(-1, x.shape[-1])
...@@ -212,7 +269,14 @@ class AWQLinearMethod(LinearMethodBase): ...@@ -212,7 +269,14 @@ class AWQLinearMethod(LinearMethodBase):
else: else:
padding_group=0 padding_group=0
if m <= self.AWQ_CK_GEMMBS: if envs.VLLM_USE_TRITON_AWQ:
if m>16:
m = 2 ** math.ceil(math.log2(m))
best_config=getspec_config(m,n,k)
out = awq_gemm_triton(reshaped_x, qweight, scales, qzeros, pack_factor, best_config)
out_shape = (x.shape[:-1] + (qweight.shape[1] * 8, ))
else:
out = ops.awq_gemm(reshaped_x, out = ops.awq_gemm(reshaped_x,
qweight, qweight,
zeros_and_scales, zeros_and_scales,
...@@ -223,15 +287,6 @@ class AWQLinearMethod(LinearMethodBase): ...@@ -223,15 +287,6 @@ class AWQLinearMethod(LinearMethodBase):
padding_group, padding_group,
self.awqsingleton.awqworkshapce, self.awqsingleton.awqworkshapce,
self.awqsingleton.awqworkshapcesize) self.awqsingleton.awqworkshapcesize)
else:
#下面是采用rocblas的做法
deqweight=ops.dequant_w4_gemm_colmajor( # shape[n, k/8] ---> [n,k]
qweight,
zeros_and_scales,
k+padding_group*self.quant_config.group_size,
n,
self.quant_config.group_size)
out=F.linear(reshaped_x, deqweight[:,0:k])
if bias is not None: if bias is not None:
out.add_(bias) out.add_(bias)
......
...@@ -44,10 +44,14 @@ def awq_dequantize_kernel( ...@@ -44,10 +44,14 @@ def awq_dequantize_kernel(
result_masks = result_masks_y[:, None] & result_masks_x[None, :] result_masks = result_masks_y[:, None] & result_masks_x[None, :]
# Load the weights. # Load the weights.
iweights = tl.load(qweight_ptr + offsets, masks, 0.0) iweights = tl.load(qweight_ptr + offsets, masks)
iweights = tl.interleave(iweights, iweights)
iweights = tl.interleave(iweights, iweights) iweights =tl.join(iweights, iweights).reshape(iweights.shape[:-1] + [2 * iweights.shape[-1]])
iweights = tl.interleave(iweights, iweights) iweights =tl.join(iweights, iweights).reshape(iweights.shape[:-1] + [2 * iweights.shape[-1]])
iweights =tl.join(iweights, iweights).reshape(iweights.shape[:-1] + [2 * iweights.shape[-1]])
# iweights = tl.interleave(iweights, iweights)
# iweights = tl.interleave(iweights, iweights)
# iweights = tl.interleave(iweights, iweights)
# Create reverse AWQ order as tensor: [0, 4, 1, 5, 2, 6, 3, 7] # Create reverse AWQ order as tensor: [0, 4, 1, 5, 2, 6, 3, 7]
# that will map given indices to the correct order. # that will map given indices to the correct order.
...@@ -73,10 +77,14 @@ def awq_dequantize_kernel( ...@@ -73,10 +77,14 @@ def awq_dequantize_kernel(
zero_masks = zero_masks_y[:, None] & zero_masks_x[None, :] zero_masks = zero_masks_y[:, None] & zero_masks_x[None, :]
# Load the zeros. # Load the zeros.
zeros = tl.load(zeros_ptr + zero_offsets, zero_masks, 0.0) zeros = tl.load(zeros_ptr + zero_offsets, zero_masks)
zeros = tl.interleave(zeros, zeros) # zeros = tl.interleave(zeros, zeros)
zeros = tl.interleave(zeros, zeros) # zeros = tl.interleave(zeros, zeros)
zeros = tl.interleave(zeros, zeros) # zeros = tl.interleave(zeros, zeros)
zeros =tl.join(zeros, zeros).reshape(zeros.shape[:-1] + [2 * zeros.shape[-1]])
zeros =tl.join(zeros, zeros).reshape(zeros.shape[:-1] + [2 * zeros.shape[-1]])
zeros =tl.join(zeros, zeros).reshape(zeros.shape[:-1] + [2 * zeros.shape[-1]])
zeros = tl.broadcast_to(zeros, (BLOCK_SIZE_Y, BLOCK_SIZE_X * 8)) zeros = tl.broadcast_to(zeros, (BLOCK_SIZE_Y, BLOCK_SIZE_X * 8))
# Unpack and reorder: shift out the correct 4-bit value and mask. # Unpack and reorder: shift out the correct 4-bit value and mask.
...@@ -93,7 +101,7 @@ def awq_dequantize_kernel( ...@@ -93,7 +101,7 @@ def awq_dequantize_kernel(
scale_masks = scale_masks_y[:, None] & scale_masks_x[None, :] scale_masks = scale_masks_y[:, None] & scale_masks_x[None, :]
# Load the scales. # Load the scales.
scales = tl.load(scales_ptr + scale_offsets, scale_masks, 0.0) scales = tl.load(scales_ptr + scale_offsets, scale_masks)
scales = tl.broadcast_to(scales, (BLOCK_SIZE_Y, BLOCK_SIZE_X * 8)) scales = tl.broadcast_to(scales, (BLOCK_SIZE_Y, BLOCK_SIZE_X * 8))
# Dequantize. # Dequantize.
...@@ -108,19 +116,26 @@ def awq_dequantize_kernel( ...@@ -108,19 +116,26 @@ def awq_dequantize_kernel(
def awq_gemm_kernel(a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, M, N, K, def awq_gemm_kernel(a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, M, N, K,
group_size, BLOCK_SIZE_M: tl.constexpr, group_size, BLOCK_SIZE_M: tl.constexpr,
BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
SPLIT_K: tl.constexpr): GROUP_SIZE_M: tl.constexpr,SPLIT_K: tl.constexpr):
pid = tl.program_id(axis=0) pid = tl.program_id(axis=0)
pid_z = tl.program_id(1) pid_z = tl.program_id(1)
# NOTE: This doesn't work in TRITON_INTERPRET=1 mode. Use below instead. num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
# num_pid_n = (N + BLOCK_SIZE_N - 1) // BLOCK_SIZE_N
num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
if GROUP_SIZE_M == 1:
pid_m = pid // num_pid_n pid_m = pid // num_pid_n
pid_n = pid % num_pid_n pid_n = pid % num_pid_n
else:
accumulator_dtype = c_ptr.type.element_ty num_pid_in_group = GROUP_SIZE_M * num_pid_n
group_id = pid // num_pid_in_group
first_pid_m = group_id * GROUP_SIZE_M
group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
pid_m = first_pid_m + (pid % group_size_m)
pid_n = (pid % num_pid_in_group) // group_size_m
# accumulator_dtype = c_ptr.type.element_ty
BLOCK_SIZE_N_8 = BLOCK_SIZE_N // 8
N_8 = N // 8
# NOTE: This doesn't work in TRITON_INTERPRET=1 mode. Use below instead. # NOTE: This doesn't work in TRITON_INTERPRET=1 mode. Use below instead.
# accumulator = tl.arange(0, BLOCK_SIZE_N) # accumulator = tl.arange(0, BLOCK_SIZE_N)
# accumulator = tl.broadcast_to(accumulator[None, :], # accumulator = tl.broadcast_to(accumulator[None, :],
...@@ -128,16 +143,16 @@ def awq_gemm_kernel(a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, M, N, K, ...@@ -128,16 +143,16 @@ def awq_gemm_kernel(a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, M, N, K,
# accumulator = accumulator & 0x0 # accumulator = accumulator & 0x0
# accumulator = accumulator.to(accumulator_dtype) # accumulator = accumulator.to(accumulator_dtype)
accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N),
dtype=accumulator_dtype) dtype=tl.float32)
# Create reverse AWQ order as tensor: [0, 4, 1, 5, 2, 6, 3, 7] # Create reverse AWQ order as tensor: [0, 4, 1, 5, 2, 6, 3, 7]
# that will map given indices to the correct order. # that will map given indices to the correct order.
reverse_awq_order_tensor = ((tl.arange(0, 2) * 4)[None, :] + shifts = ((tl.arange(0, 2) * 16)[None, :] +
tl.arange(0, 4)[:, None]).reshape(8) (tl.arange(0, 4) * 4)[:, None]).reshape(1,8)
# Create the necessary shifts to use to unpack. # Create the necessary shifts to use to unpack.
shifts = reverse_awq_order_tensor * 4 # shifts = reverse_awq_order_tensor * 4
shifts = tl.broadcast_to(shifts[None, :], shifts = tl.broadcast_to(shifts,
(BLOCK_SIZE_K * (BLOCK_SIZE_N // 8), 8)) (BLOCK_SIZE_K * (BLOCK_SIZE_N // 8), 8))
shifts = tl.reshape(shifts, (BLOCK_SIZE_K, BLOCK_SIZE_N)) shifts = tl.reshape(shifts, (BLOCK_SIZE_K, BLOCK_SIZE_N))
...@@ -145,18 +160,15 @@ def awq_gemm_kernel(a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, M, N, K, ...@@ -145,18 +160,15 @@ def awq_gemm_kernel(a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, M, N, K,
offsets_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) offsets_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
masks_am = offsets_am < M masks_am = offsets_am < M
offsets_bn = pid_n * (BLOCK_SIZE_N // 8) + tl.arange(0, BLOCK_SIZE_N // 8) offsets_bzn = pid_n * (BLOCK_SIZE_N_8) + tl.arange(0, BLOCK_SIZE_N // 8)
masks_bn = offsets_bn < N // 8 masks_bzn = offsets_bzn < N_8
offsets_zn = pid_n * (BLOCK_SIZE_N // 8) + tl.arange(0, BLOCK_SIZE_N // 8)
masks_zn = offsets_zn < N // 8
offsets_sn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) offsets_sn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
masks_sn = offsets_sn < N masks_sn = offsets_sn < N
offsets_k = pid_z * BLOCK_SIZE_K + tl.arange(0, BLOCK_SIZE_K) offsets_k = pid_z * BLOCK_SIZE_K + tl.arange(0, BLOCK_SIZE_K)
offsets_a = K * offsets_am[:, None] + offsets_k[None, :] offsets_a = K * offsets_am[:, None] + offsets_k[None, :]
offsets_b = (N // 8) * offsets_k[:, None] + offsets_bn[None, :] offsets_b = (N_8) * offsets_k[:, None] + offsets_bzn[None, :]
a_ptrs = a_ptr + offsets_a a_ptrs = a_ptr + offsets_a
b_ptrs = b_ptr + offsets_b b_ptrs = b_ptr + offsets_b
...@@ -167,33 +179,40 @@ def awq_gemm_kernel(a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, M, N, K, ...@@ -167,33 +179,40 @@ def awq_gemm_kernel(a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, M, N, K,
for k in range(0, tl.cdiv(K, BLOCK_SIZE_K * SPLIT_K)): for k in range(0, tl.cdiv(K, BLOCK_SIZE_K * SPLIT_K)):
masks_k = offsets_k < K masks_k = offsets_k < K
masks_a = masks_am[:, None] & masks_k[None, :] masks_a = masks_am[:, None] & masks_k[None, :]
a = tl.load(a_ptrs, mask=masks_a, other=0.0) a = tl.load(a_ptrs, mask=masks_a)
masks_b = masks_k[:, None] & masks_bn[None, :] masks_b = masks_k[:, None] & masks_bzn[None, :]
b = tl.load(b_ptrs, mask=masks_b, other=0.0) b = tl.load(b_ptrs, mask=masks_b)
b = tl.interleave(b, b) # b = tl.interleave(b, b)
b = tl.interleave(b, b) # b = tl.interleave(b, b)
b = tl.interleave(b, b) # b = tl.interleave(b, b)
b =tl.join(b, b).reshape(b.shape[:-1] + [2 * b.shape[-1]])
b =tl.join(b, b).reshape(b.shape[:-1] + [2 * b.shape[-1]])
b =tl.join(b, b).reshape(b.shape[:-1] + [2 * b.shape[-1]])
# Dequantize b. # Dequantize b.
offsets_szk = ( offsets_szk = (BLOCK_SIZE_K * SPLIT_K * k + pid_z * BLOCK_SIZE_K) // group_size
(BLOCK_SIZE_K * SPLIT_K * k + pid_z * BLOCK_SIZE_K) // group_size + offsets_szk = offsets_szk + (tl.arange(0,BLOCK_SIZE_K) // group_size)
tl.arange(0, 1)) offsets_z = (N_8) * offsets_szk[:, None] + offsets_bzn[None, :]
offsets_z = (N // 8) * offsets_szk[:, None] + offsets_zn[None, :]
masks_zk = offsets_szk < K // group_size masks_zk = offsets_szk < K // group_size
masks_z = masks_zk[:, None] & masks_zn[None, :] masks_z = masks_zk[:, None] & masks_bzn[None, :]
zeros_ptrs = zeros_ptr + offsets_z zeros_ptrs = zeros_ptr + offsets_z
zeros = tl.load(zeros_ptrs, mask=masks_z, other=0.0) zeros = tl.load(zeros_ptrs, mask=masks_z)
zeros = tl.interleave(zeros, zeros) # zeros = tl.interleave(zeros, zeros)
zeros = tl.interleave(zeros, zeros) # zeros = tl.interleave(zeros, zeros)
zeros = tl.interleave(zeros, zeros) # zeros = tl.interleave(zeros, zeros)
zeros =tl.join(zeros, zeros).reshape(zeros.shape[:-1] + [2 * zeros.shape[-1]])
zeros =tl.join(zeros, zeros).reshape(zeros.shape[:-1] + [2 * zeros.shape[-1]])
zeros =tl.join(zeros, zeros).reshape(zeros.shape[:-1] + [2 * zeros.shape[-1]])
zeros = tl.broadcast_to(zeros, (BLOCK_SIZE_K, BLOCK_SIZE_N)) zeros = tl.broadcast_to(zeros, (BLOCK_SIZE_K, BLOCK_SIZE_N))
offsets_s = N * offsets_szk[:, None] + offsets_sn[None, :] offsets_s = N * offsets_szk[:, None] + offsets_sn[None, :]
masks_sk = offsets_szk < K // group_size masks_sk = offsets_szk < K // group_size
masks_s = masks_sk[:, None] & masks_sn[None, :] masks_s = masks_sk[:, None] & masks_sn[None, :]
scales_ptrs = scales_ptr + offsets_s scales_ptrs = scales_ptr + offsets_s
scales = tl.load(scales_ptrs, mask=masks_s, other=0.0) scales = tl.load(scales_ptrs, mask=masks_s)
scales = tl.broadcast_to(scales, (BLOCK_SIZE_K, BLOCK_SIZE_N)) scales = tl.broadcast_to(scales, (BLOCK_SIZE_K, BLOCK_SIZE_N))
b = (b >> shifts) & 0xF b = (b >> shifts) & 0xF
...@@ -202,18 +221,20 @@ def awq_gemm_kernel(a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, M, N, K, ...@@ -202,18 +221,20 @@ def awq_gemm_kernel(a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, M, N, K,
b = b.to(c_ptr.type.element_ty) b = b.to(c_ptr.type.element_ty)
# Accumulate results. # Accumulate results.
accumulator = tl.dot(a, b, accumulator, out_dtype=accumulator_dtype) accumulator = tl.dot(a, b, accumulator, out_dtype=tl.float32)
offsets_k += BLOCK_SIZE_K * SPLIT_K offsets_k += BLOCK_SIZE_K * SPLIT_K
a_ptrs += BLOCK_SIZE_K * SPLIT_K a_ptrs += BLOCK_SIZE_K * SPLIT_K
b_ptrs += BLOCK_SIZE_K * SPLIT_K * (N // 8) b_ptrs += BLOCK_SIZE_K * SPLIT_K * (N_8)
c = accumulator.to(c_ptr.type.element_ty) c = accumulator.to(c_ptr.type.element_ty)
offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) c_ptrs = c_ptr + N * offsets_am[:, None] + offsets_sn[None, :]
offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) c_mask = masks_am[:, None] & masks_sn[None, :]
c_ptrs = c_ptr + pid_z * N * M + N * offs_cm[:, None] + offs_cn[None, :] if SPLIT_K == 1:
c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)
tl.store(c_ptrs, c, mask=c_mask) tl.store(c_ptrs, c, mask=c_mask)
# tl.store(c_ptrs, c)
else:
tl.atomic_add(c_ptrs, c, mask=c_mask)
# qweights - [K , M // 8], int32 # qweights - [K , M // 8], int32
...@@ -272,9 +293,7 @@ def awq_gemm_triton(input: torch.Tensor, ...@@ -272,9 +293,7 @@ def awq_gemm_triton(input: torch.Tensor,
scales: torch.Tensor, scales: torch.Tensor,
qzeros: torch.Tensor, qzeros: torch.Tensor,
split_k_iters: int, split_k_iters: int,
block_size_m: int = 32, config) -> torch.Tensor:
block_size_n: int = 32,
block_size_k: int = 32) -> torch.Tensor:
M, K = input.shape M, K = input.shape
N = qweight.shape[1] * 8 N = qweight.shape[1] * 8
group_size = qweight.shape[0] // qzeros.shape[0] group_size = qweight.shape[0] // qzeros.shape[0]
...@@ -289,14 +308,16 @@ def awq_gemm_triton(input: torch.Tensor, ...@@ -289,14 +308,16 @@ def awq_gemm_triton(input: torch.Tensor,
assert group_size in AWQ_TRITON_SUPPORTED_GROUP_SIZES or group_size == K assert group_size in AWQ_TRITON_SUPPORTED_GROUP_SIZES or group_size == K
grid = lambda META: ( grid = lambda META: (
triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv( triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(N, META['BLOCK_SIZE_N']),
N, META['BLOCK_SIZE_N']), META['SPLIT_K'],
split_k_iters,
) )
if config is None:
config= {'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8,'SPLIT_K': 8}
if M >256:
#print("INFO:this size not found in json.")
config= {'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8,'SPLIT_K': 1}
result = torch.zeros((split_k_iters, M, N), result = torch.zeros((M, N), dtype=scales.dtype, device=input.device)
dtype=scales.dtype,
device=input.device)
# A = input, B = qweight, C = result # A = input, B = qweight, C = result
# A = M x K, B = K x N, C = M x N # A = M x K, B = K x N, C = M x N
...@@ -309,11 +330,5 @@ def awq_gemm_triton(input: torch.Tensor, ...@@ -309,11 +330,5 @@ def awq_gemm_triton(input: torch.Tensor,
N, N,
K, K,
group_size, group_size,
BLOCK_SIZE_M=block_size_m, **config)
BLOCK_SIZE_N=block_size_n,
BLOCK_SIZE_K=block_size_k,
SPLIT_K=split_k_iters)
result = result.sum(0)
return result return result
{
"1536_7168": {
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"3": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"5": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"6": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"7": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"9": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"10": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"11": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"12": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"13": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"14": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"15": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"32": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"64": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"128": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"256": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
},
"512": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
},
"1024": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 2,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
},
"2048": {
"BLOCK_SIZE_M": 256,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 8,
"SPLIT_K": 2,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
},
"4096": {
"BLOCK_SIZE_M": 256,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
}
}
}
\ No newline at end of file
{
"1536_7168": {
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"3": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"5": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"6": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"7": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"9": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"10": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"11": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"12": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"13": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"14": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"15": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"32": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 0
},
"64": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 0
},
"128": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"256": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"512": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
},
"1024": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 2,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
},
"2048": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"4096": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
}
}
}
\ No newline at end of file
{
"3072_1536": {
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"3": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"5": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"6": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"7": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"9": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"10": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"11": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"12": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"13": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"14": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"15": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"32": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"64": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 4,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"128": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 2,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"256": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"512": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
},
"1024": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"2048": {
"BLOCK_SIZE_M": 256,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
},
"4096": {
"BLOCK_SIZE_M": 256,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
}
}
}
\ No newline at end of file
{
"3072_1536": {
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"3": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"5": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"6": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"7": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"9": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"10": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"11": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"12": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"13": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"14": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"15": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"32": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"64": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 0
},
"128": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"256": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"512": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"1024": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
},
"2048": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"4096": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
}
}
}
\ No newline at end of file
{
"4096_512": {
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"3": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"5": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"6": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"7": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"9": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"10": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"11": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"12": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"13": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"14": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"15": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"32": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"64": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"128": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"256": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"512": {
"BLOCK_SIZE_M": 256,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
},
"1024": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"2048": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"4096": {
"BLOCK_SIZE_M": 256,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
}
}
}
\ No newline at end of file
{
"4096_512": {
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"3": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"5": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"6": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"7": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"9": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"10": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"11": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"12": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"13": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"14": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"15": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"32": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 2,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 0
},
"64": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 0
},
"128": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"256": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"512": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"1024": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"2048": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"4096": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
}
}
}
\ No newline at end of file
{
"4608_7168": {
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 2,
"num_ldmatrixes": 1
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 2,
"num_ldmatrixes": 1
},
"3": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 2,
"num_ldmatrixes": 1
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 2,
"num_ldmatrixes": 1
},
"5": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 2,
"num_ldmatrixes": 1
},
"6": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 2,
"num_ldmatrixes": 1
},
"7": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 2,
"num_ldmatrixes": 1
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 2,
"num_ldmatrixes": 1
},
"9": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 2,
"num_ldmatrixes": 1
},
"10": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 2,
"num_ldmatrixes": 1
},
"11": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 2,
"num_ldmatrixes": 1
},
"12": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 2,
"num_ldmatrixes": 1
},
"13": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 2,
"num_ldmatrixes": 1
},
"14": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 2,
"num_ldmatrixes": 1
},
"15": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 2,
"num_ldmatrixes": 1
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 2,
"num_ldmatrixes": 1
},
"32": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"64": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
},
"128": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
},
"256": {
"BLOCK_SIZE_M": 256,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 2,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
},
"512": {
"BLOCK_SIZE_M": 256,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
},
"1024": {
"BLOCK_SIZE_M": 256,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 8,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
},
"2048": {
"BLOCK_SIZE_M": 256,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
},
"4096": {
"BLOCK_SIZE_M": 256,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 8,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
}
}
}
\ No newline at end of file
{
"4608_7168": {
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"3": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"5": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"6": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"7": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"9": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"10": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"11": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"12": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"13": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"14": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"15": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"32": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"64": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 4,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"128": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
},
"256": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"512": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 2,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"1024": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"2048": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
},
"4096": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 8,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
}
}
}
\ No newline at end of file
{
"512_7168": {
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"3": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"5": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"6": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"7": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"9": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"10": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"11": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"12": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"13": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"14": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"15": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"32": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"64": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"128": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"256": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"512": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 4,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"1024": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
},
"2048": {
"BLOCK_SIZE_M": 256,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 2,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
},
"4096": {
"BLOCK_SIZE_M": 256,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
}
}
}
\ No newline at end of file
{
"512_7168": {
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"3": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"5": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"6": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"7": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"9": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"10": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"11": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"12": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"13": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"14": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"15": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"32": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"64": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"128": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 0
},
"256": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"512": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"1024": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"2048": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 2,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"4096": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 2,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
}
}
}
\ No newline at end of file
{
"576_7168": {
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"3": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"5": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"6": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"7": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"9": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"10": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"11": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"12": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"13": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"14": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"15": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"32": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"64": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"128": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"256": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"512": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 4,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"1024": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
},
"2048": {
"BLOCK_SIZE_M": 256,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 2,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
},
"4096": {
"BLOCK_SIZE_M": 256,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
}
}
}
\ No newline at end of file
{
"576_7168": {
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"3": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"5": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"6": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"7": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"9": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"10": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"11": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"12": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"13": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"14": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"15": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"32": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"64": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"128": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"256": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"512": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"1024": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
},
"2048": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 8,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"4096": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 2,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
}
}
}
\ No newline at end of file
{
"7168_2048": {
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"3": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 8,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"5": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"6": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"7": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"9": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"10": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"11": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"12": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"13": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"14": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"15": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 0
},
"32": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 4,
"num_stages": 1,
"num_warps": 4,
"num_ldmatrixes": 1
},
"64": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 2,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"128": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 8,
"num_ldmatrixes": 1
},
"256": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
},
"512": {
"BLOCK_SIZE_M": 256,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 0,
"num_warps": 4,
"num_ldmatrixes": 1
},
"1024": {
"BLOCK_SIZE_M": 256,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 8,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
},
"2048": {
"BLOCK_SIZE_M": 256,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
},
"4096": {
"BLOCK_SIZE_M": 256,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 4,
"SPLIT_K": 1,
"num_stages": 1,
"num_warps": 8,
"num_ldmatrixes": 1
}
}
}
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment